nv-ingest-api 25.6.2__tar.gz → 25.6.26.dev20250626__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (164)
  1. {nv_ingest_api-25.6.2/src/nv_ingest_api.egg-info → nv_ingest_api-25.6.26.dev20250626}/PKG-INFO +1 -1
  2. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/audio/audio_extraction.py +50 -14
  3. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +1 -1
  4. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +1 -0
  5. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +1 -0
  6. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/meta/metadata_schema.py +2 -0
  7. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/transform/split_text.py +19 -5
  8. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626/src/nv_ingest_api.egg-info}/PKG-INFO +1 -1
  9. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/version.py +0 -8
  10. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/LICENSE +0 -0
  11. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/MANIFEST.in +0 -0
  12. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/README.md +0 -0
  13. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/pyproject.toml +0 -0
  14. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/setup.cfg +0 -0
  15. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/__init__.py +0 -0
  16. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/interface/__init__.py +0 -0
  17. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/interface/extract.py +0 -0
  18. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/interface/mutate.py +0 -0
  19. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/interface/store.py +0 -0
  20. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/interface/transform.py +0 -0
  21. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/interface/utility.py +0 -0
  22. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/__init__.py +0 -0
  23. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/enums/__init__.py +0 -0
  24. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/enums/common.py +0 -0
  25. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/__init__.py +0 -0
  26. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/audio/__init__.py +0 -0
  27. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/docx/__init__.py +0 -0
  28. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -0
  29. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  30. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -0
  31. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -0
  32. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -0
  33. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/html/__init__.py +0 -0
  34. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/html/html_extractor.py +0 -0
  35. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/image/__init__.py +0 -0
  36. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/image/chart_extractor.py +0 -0
  37. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/image/image_extractor.py +0 -0
  38. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -0
  39. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -0
  40. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -0
  41. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/image/table_extractor.py +0 -0
  42. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/pdf/__init__.py +0 -0
  43. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -0
  44. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -0
  45. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -0
  46. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -0
  47. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -0
  48. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -0
  49. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -0
  50. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -0
  51. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -0
  52. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/pptx/__init__.py +0 -0
  53. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  54. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -0
  55. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -0
  56. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/mutate/__init__.py +0 -0
  57. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/mutate/deduplicate.py +0 -0
  58. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/mutate/filter.py +0 -0
  59. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/primitives/__init__.py +0 -0
  60. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/primitives/control_message_task.py +0 -0
  61. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/primitives/ingest_control_message.py +0 -0
  62. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/primitives/nim/__init__.py +0 -0
  63. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/primitives/nim/default_values.py +0 -0
  64. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -0
  65. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -0
  66. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -0
  67. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -0
  68. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -0
  69. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -0
  70. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -0
  71. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -0
  72. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -0
  73. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -0
  74. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/primitives/nim/nim_client.py +0 -0
  75. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -0
  76. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  77. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/primitives/tracing/latency.py +0 -0
  78. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/primitives/tracing/logging.py +0 -0
  79. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/primitives/tracing/tagging.py +0 -0
  80. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/__init__.py +0 -0
  81. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/extract/__init__.py +0 -0
  82. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -0
  83. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -0
  84. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/extract/extract_html_schema.py +0 -0
  85. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -0
  86. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -0
  87. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -0
  88. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -0
  89. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -0
  90. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -0
  91. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -0
  92. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -0
  93. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -0
  94. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/meta/__init__.py +0 -0
  95. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -0
  96. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/mutate/__init__.py +0 -0
  97. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -0
  98. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/store/__init__.py +0 -0
  99. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -0
  100. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -0
  101. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/transform/__init__.py +0 -0
  102. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -0
  103. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -0
  104. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -0
  105. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -0
  106. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/store/__init__.py +0 -0
  107. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/store/embed_text_upload.py +0 -0
  108. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/store/image_upload.py +0 -0
  109. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/transform/__init__.py +0 -0
  110. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/transform/caption_image.py +0 -0
  111. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/internal/transform/embed_text.py +0 -0
  112. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/__init__.py +0 -0
  113. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/control_message/__init__.py +0 -0
  114. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/control_message/validators.py +0 -0
  115. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/converters/__init__.py +0 -0
  116. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/converters/bytetools.py +0 -0
  117. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/converters/containers.py +0 -0
  118. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/converters/datetools.py +0 -0
  119. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/converters/dftools.py +0 -0
  120. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/converters/formats.py +0 -0
  121. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/converters/type_mappings.py +0 -0
  122. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/detectors/__init__.py +0 -0
  123. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/detectors/language.py +0 -0
  124. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/exception_handlers/converters.py +0 -0
  126. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/exception_handlers/decorators.py +0 -0
  127. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/exception_handlers/detectors.py +0 -0
  128. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/exception_handlers/pdf.py +0 -0
  129. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/exception_handlers/schemas.py +0 -0
  130. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/image_processing/__init__.py +0 -0
  131. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/image_processing/clustering.py +0 -0
  132. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/image_processing/processing.py +0 -0
  133. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/image_processing/table_and_chart.py +0 -0
  134. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/image_processing/transforms.py +0 -0
  135. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/logging/__init__.py +0 -0
  136. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/logging/configuration.py +0 -0
  137. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/message_brokers/__init__.py +0 -0
  138. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -0
  139. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -0
  140. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -0
  141. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -0
  142. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/metadata/__init__.py +0 -0
  143. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/metadata/aggregators.py +0 -0
  144. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/multi_processing/__init__.py +0 -0
  145. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -0
  146. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/nim/__init__.py +0 -0
  147. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/pdf/__init__.py +0 -0
  148. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/pdf/pdfium.py +0 -0
  149. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/schema/__init__.py +0 -0
  150. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/schema/schema_validator.py +0 -0
  151. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/service_clients/__init__.py +0 -0
  152. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/service_clients/client_base.py +0 -0
  153. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/service_clients/kafka/__init__.py +0 -0
  154. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  155. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/service_clients/redis/redis_client.py +0 -0
  156. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  157. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/service_clients/rest/rest_client.py +0 -0
  158. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/string_processing/__init__.py +0 -0
  159. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/system/__init__.py +0 -0
  160. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api/util/system/hardware_info.py +0 -0
  161. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api.egg-info/SOURCES.txt +0 -0
  162. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api.egg-info/dependency_links.txt +0 -0
  163. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api.egg-info/requires.txt +0 -0
  164. {nv_ingest_api-25.6.2 → nv_ingest_api-25.6.26.dev20250626}/src/nv_ingest_api.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 25.6.2
3
+ Version: 25.6.26.dev20250626
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -5,6 +5,8 @@
5
5
  import logging
6
6
 
7
7
  import pandas as pd
8
+ import functools
9
+ import uuid
8
10
  from typing import Any
9
11
  from typing import Dict
10
12
  from typing import Optional
@@ -21,7 +23,7 @@ logger = logging.getLogger(__name__)
21
23
 
22
24
 
23
25
  @unified_exception_handler
24
- def _update_audio_metadata(row: pd.Series, audio_client: Any, trace_info: Dict) -> Dict:
26
+ def _extract_from_audio(row: pd.Series, audio_client: Any, trace_info: Dict, segment_audio: bool = False) -> Dict:
25
27
  """
26
28
  Modifies the metadata of a row if the conditions for table extraction are met.
27
29
 
@@ -56,24 +58,42 @@ def _update_audio_metadata(row: pd.Series, audio_client: Any, trace_info: Dict)
56
58
  base64_audio = metadata.pop("content")
57
59
  content_metadata = metadata.get("content_metadata", {})
58
60
 
59
- # Only modify if content type is audio
61
+ # Only extract transcript if content type is audio
60
62
  if (content_metadata.get("type") != ContentTypeEnum.AUDIO) or (base64_audio in (None, "")):
61
- return metadata
63
+ return [row.to_list()]
62
64
 
63
- # Modify audio metadata with the result from the inference model
64
- audio_result = audio_client.infer(
65
+ # Get the result from the inference model
66
+ segments, transcript = audio_client.infer(
65
67
  base64_audio,
66
68
  model_name="parakeet",
67
69
  trace_info=trace_info, # traceable_func arg
68
70
  stage_name="audio_extraction",
69
71
  )
70
72
 
71
- row["document_type"] = ContentTypeEnum.AUDIO
72
- audio_metadata = {"audio_transcript": audio_result}
73
- metadata["audio_metadata"] = validate_schema(audio_metadata, AudioMetadataSchema).model_dump()
74
- row["metadata"] = validate_schema(metadata, MetadataSchema).model_dump()
73
+ extracted_data = []
74
+ if segment_audio:
75
+ for segment in segments:
76
+ segment_metadata = metadata.copy()
77
+ audio_metadata = {"audio_transcript": segment["text"]}
78
+ segment_metadata["audio_metadata"] = validate_schema(audio_metadata, AudioMetadataSchema).model_dump()
79
+ segment_metadata["content_metadata"]["start_time"] = segment["start"]
80
+ segment_metadata["content_metadata"]["end_time"] = segment["end"]
81
+
82
+ extracted_data.append(
83
+ [
84
+ ContentTypeEnum.AUDIO,
85
+ validate_schema(segment_metadata, MetadataSchema).model_dump(),
86
+ str(uuid.uuid4()),
87
+ ]
88
+ )
89
+ else:
90
+ audio_metadata = {"audio_transcript": transcript}
91
+ metadata["audio_metadata"] = validate_schema(audio_metadata, AudioMetadataSchema).model_dump()
92
+ extracted_data.append(
93
+ [ContentTypeEnum.AUDIO, validate_schema(metadata, MetadataSchema).model_dump(), str(uuid.uuid4())]
94
+ )
75
95
 
76
- return metadata
96
+ return extracted_data
77
97
 
78
98
 
79
99
  def extract_text_from_audio_internal(
@@ -121,6 +141,7 @@ def extract_text_from_audio_internal(
121
141
  function_id = extract_params.get("function_id") or audio_extraction_config.function_id
122
142
  use_ssl = extract_params.get("use_ssl") or audio_extraction_config.use_ssl
123
143
  ssl_cert = extract_params.get("ssl_cert") or audio_extraction_config.ssl_cert
144
+ segment_audio = extract_params.get("segment_audio") or audio_extraction_config.segment_audio
124
145
 
125
146
  parakeet_client = create_audio_inference_client(
126
147
  (grpc_endpoint, http_endpoint),
@@ -136,12 +157,27 @@ def extract_text_from_audio_internal(
136
157
  logger.debug("No trace_info provided. Initialized empty trace_info dictionary.")
137
158
 
138
159
  try:
139
- # Apply the _update_metadata function to each row in the DataFrame
140
- df_extraction_ledger["metadata"] = df_extraction_ledger.apply(
141
- _update_audio_metadata, axis=1, args=(parakeet_client, execution_trace_log)
160
+ # Create a partial function to extract using the provided configurations.
161
+ _extract_from_audio_partial = functools.partial(
162
+ _extract_from_audio,
163
+ audio_client=parakeet_client,
164
+ trace_info=execution_trace_log,
165
+ segment_audio=segment_audio,
142
166
  )
143
167
 
144
- return df_extraction_ledger, execution_trace_log
168
+ # Apply the _extract_from_audio_partial function to each row in the DataFrame
169
+ extraction_series = df_extraction_ledger.apply(_extract_from_audio_partial, axis=1)
170
+
171
+ # Explode the results if the extraction returns lists.
172
+ extraction_series = extraction_series.explode().dropna()
173
+
174
+ # Convert the extracted results into a DataFrame.
175
+ if not extraction_series.empty:
176
+ extracted_df = pd.DataFrame(extraction_series.to_list(), columns=["document_type", "metadata", "uuid"])
177
+ else:
178
+ extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})
179
+
180
+ return extracted_df, execution_trace_log
145
181
 
146
182
  except Exception as e:
147
183
  logger.exception(f"Error occurred while extracting audio data: {e}", exc_info=True)
@@ -101,7 +101,7 @@ class ParakeetClient:
101
101
  segments, transcript = process_transcription_response(response)
102
102
  logger.debug("Processing Parakeet inference results (pass-through).")
103
103
 
104
- return transcript
104
+ return segments, transcript
105
105
 
106
106
  def transcribe(
107
107
  self,
@@ -48,6 +48,7 @@ class AudioConfigSchema(BaseModel):
48
48
  function_id: Optional[str] = None
49
49
  use_ssl: Optional[bool] = None
50
50
  ssl_cert: Optional[str] = None
51
+ segment_audio: Optional[bool] = None
51
52
 
52
53
  @root_validator(pre=True)
53
54
  def validate_endpoints(cls, values):
@@ -124,6 +124,7 @@ class IngestTaskAudioExtraction(BaseModelNoExt):
124
124
  function_id: Optional[str] = None
125
125
  use_ssl: Optional[bool] = None
126
126
  ssl_cert: Optional[str] = None
127
+ segment_audio: Optional[bool] = None
127
128
 
128
129
 
129
130
  class IngestTaskTableExtraction(BaseModelNoExt):
@@ -97,6 +97,8 @@ class ContentMetadataSchema(BaseModelNoExt):
97
97
  page_number: int = -1
98
98
  hierarchy: ContentHierarchySchema = ContentHierarchySchema()
99
99
  subtype: Union[ContentTypeEnum, str] = ""
100
+ start_time: int = -1
101
+ end_time: int = -1
100
102
 
101
103
 
102
104
  class TextMetadataSchema(BaseModelNoExt):
@@ -31,9 +31,16 @@ def _build_split_documents(row, chunks: List[str]) -> List[dict[str, Any]]:
31
31
  metadata = row.metadata if hasattr(row, "metadata") and isinstance(row.metadata, dict) else {}
32
32
  metadata = copy.deepcopy(metadata)
33
33
 
34
- metadata["content"] = text
35
-
36
- documents.append({"document_type": ContentTypeEnum.TEXT.value, "metadata": metadata, "uuid": str(uuid.uuid4())})
34
+ if row.document_type == ContentTypeEnum.AUDIO:
35
+ metadata["audio_metadata"]["audio_transcript"] = text
36
+ documents.append(
37
+ {"document_type": ContentTypeEnum.AUDIO.value, "metadata": metadata, "uuid": str(uuid.uuid4())}
38
+ )
39
+ else:
40
+ metadata["content"] = text
41
+ documents.append(
42
+ {"document_type": ContentTypeEnum.TEXT.value, "metadata": metadata, "uuid": str(uuid.uuid4())}
43
+ )
37
44
 
38
45
  return documents
39
46
 
@@ -118,7 +125,7 @@ def transform_text_split_and_tokenize_internal(
118
125
  )
119
126
 
120
127
  # Filter to documents with text content.
121
- text_type_condition = df_transform_ledger["document_type"] == ContentTypeEnum.TEXT
128
+ text_type_condition = df_transform_ledger["document_type"].isin([ContentTypeEnum.TEXT, ContentTypeEnum.AUDIO])
122
129
 
123
130
  normalized_meta_df = pd.json_normalize(df_transform_ledger["metadata"], errors="ignore")
124
131
  if "source_metadata.source_type" in normalized_meta_df.columns:
@@ -147,7 +154,14 @@ def transform_text_split_and_tokenize_internal(
147
154
 
148
155
  split_docs: List[Dict[str, Any]] = []
149
156
  for _, row in df_filtered.iterrows():
150
- content: str = row["metadata"]["content"] if row["metadata"]["content"] is not None else ""
157
+ if row["document_type"] == ContentTypeEnum.AUDIO:
158
+ content: str = (
159
+ row["metadata"]["audio_metadata"]["audio_transcript"]
160
+ if row["metadata"]["audio_metadata"]["audio_transcript"] is not None
161
+ else ""
162
+ )
163
+ else:
164
+ content: str = row["metadata"]["content"] if row["metadata"]["content"] is not None else ""
151
165
  chunks: List[str] = _split_into_chunks(content, tokenizer_model, chunk_size, chunk_overlap)
152
166
  split_docs.extend(_build_split_documents(row, chunks))
153
167
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 25.6.2
3
+ Version: 25.6.26.dev20250626
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -5,7 +5,6 @@
5
5
 
6
6
  import datetime
7
7
  import os
8
- import re
9
8
 
10
9
 
11
10
  def get_version():
@@ -16,13 +15,6 @@ def get_version():
16
15
  if not version:
17
16
  version = f"{datetime.datetime.now().strftime('%Y.%m.%d')}"
18
17
 
19
- # We only check this for dev, we assume for release the user knows what they are doing
20
- if release_type != "release":
21
- # Ensure the version is PEP 440 compatible
22
- pep440_regex = r"^\d{4}\.\d{1,2}\.\d{1,2}$"
23
- if not re.match(pep440_regex, version):
24
- raise ValueError(f"Version '{version}' is not PEP 440 compatible")
25
-
26
18
  # Construct the final version string
27
19
  if release_type == "dev":
28
20
  # If rev is not specified and defaults to 0 lets create a more meaningful development