nv-ingest-api 2025.9.23.dev20250923__tar.gz → 2025.9.26.dev20250926__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (180)
  1. {nv_ingest_api-2025.9.23.dev20250923/src/nv_ingest_api.egg-info → nv_ingest_api-2025.9.26.dev20250926}/PKG-INFO +5 -1
  2. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/pyproject.toml +4 -0
  3. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +1 -5
  4. nv_ingest_api-2025.9.26.dev20250926/src/nv_ingest_api/util/dataloader/__init__.py +9 -0
  5. nv_ingest_api-2025.9.26.dev20250926/src/nv_ingest_api/util/dataloader/dataloader.py +371 -0
  6. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926/src/nv_ingest_api.egg-info}/PKG-INFO +5 -1
  7. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api.egg-info/SOURCES.txt +2 -0
  8. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api.egg-info/requires.txt +4 -0
  9. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/LICENSE +0 -0
  10. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/MANIFEST.in +0 -0
  11. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/README.md +0 -0
  12. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/setup.cfg +0 -0
  13. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/__init__.py +0 -0
  14. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/interface/__init__.py +0 -0
  15. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/interface/extract.py +0 -0
  16. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/interface/mutate.py +0 -0
  17. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/interface/store.py +0 -0
  18. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/interface/transform.py +0 -0
  19. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/interface/utility.py +0 -0
  20. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/__init__.py +0 -0
  21. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/enums/__init__.py +0 -0
  22. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/enums/common.py +0 -0
  23. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/__init__.py +0 -0
  24. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/audio/__init__.py +0 -0
  25. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -0
  26. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/docx/__init__.py +0 -0
  27. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -0
  28. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  29. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -0
  30. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -0
  31. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -0
  32. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/html/__init__.py +0 -0
  33. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/html/html_extractor.py +0 -0
  34. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/image/__init__.py +0 -0
  35. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/image/chart_extractor.py +0 -0
  36. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/image/image_extractor.py +0 -0
  37. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -0
  38. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -0
  39. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -0
  40. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/image/table_extractor.py +0 -0
  41. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pdf/__init__.py +0 -0
  42. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -0
  43. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -0
  44. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -0
  45. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -0
  46. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -0
  47. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -0
  48. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -0
  49. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -0
  50. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -0
  51. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pptx/__init__.py +0 -0
  52. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  53. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -0
  54. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -0
  55. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/meta/__init__.py +0 -0
  56. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/meta/udf.py +0 -0
  57. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/mutate/__init__.py +0 -0
  58. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/mutate/deduplicate.py +0 -0
  59. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/mutate/filter.py +0 -0
  60. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/__init__.py +0 -0
  61. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/control_message_task.py +0 -0
  62. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/ingest_control_message.py +0 -0
  63. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/__init__.py +0 -0
  64. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/default_values.py +0 -0
  65. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -0
  66. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -0
  67. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -0
  68. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -0
  69. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -0
  70. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -0
  71. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +0 -0
  72. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -0
  73. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -0
  74. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -0
  75. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/nim_client.py +0 -0
  76. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -0
  77. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  78. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/tracing/latency.py +0 -0
  79. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/tracing/logging.py +0 -0
  80. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/tracing/tagging.py +0 -0
  81. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/__init__.py +0 -0
  82. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/extract/__init__.py +0 -0
  83. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -0
  84. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -0
  85. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -0
  86. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/extract/extract_html_schema.py +0 -0
  87. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -0
  88. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -0
  89. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -0
  90. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -0
  91. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -0
  92. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -0
  93. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -0
  94. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -0
  95. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -0
  96. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/meta/__init__.py +0 -0
  97. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -0
  98. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -0
  99. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -0
  100. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/meta/udf.py +0 -0
  101. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/mutate/__init__.py +0 -0
  102. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -0
  103. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/store/__init__.py +0 -0
  104. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -0
  105. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -0
  106. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/transform/__init__.py +0 -0
  107. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -0
  108. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -0
  109. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -0
  110. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -0
  111. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/store/__init__.py +0 -0
  112. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/store/embed_text_upload.py +0 -0
  113. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/store/image_upload.py +0 -0
  114. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/transform/__init__.py +0 -0
  115. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/transform/caption_image.py +0 -0
  116. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/transform/embed_text.py +0 -0
  117. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/transform/split_text.py +0 -0
  118. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/__init__.py +0 -0
  119. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/control_message/__init__.py +0 -0
  120. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/control_message/validators.py +0 -0
  121. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/converters/__init__.py +0 -0
  122. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/converters/bytetools.py +0 -0
  123. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/converters/containers.py +0 -0
  124. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/converters/datetools.py +0 -0
  125. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/converters/dftools.py +0 -0
  126. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/converters/formats.py +0 -0
  127. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/converters/type_mappings.py +0 -0
  128. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/detectors/__init__.py +0 -0
  129. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/detectors/language.py +0 -0
  130. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  131. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/exception_handlers/converters.py +0 -0
  132. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/exception_handlers/decorators.py +0 -0
  133. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/exception_handlers/detectors.py +0 -0
  134. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/exception_handlers/pdf.py +0 -0
  135. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/exception_handlers/schemas.py +0 -0
  136. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/image_processing/__init__.py +0 -0
  137. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/image_processing/clustering.py +0 -0
  138. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/image_processing/processing.py +0 -0
  139. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/image_processing/table_and_chart.py +0 -0
  140. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/image_processing/transforms.py +0 -0
  141. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/imports/__init__.py +0 -0
  142. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/imports/callable_signatures.py +0 -0
  143. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/imports/dynamic_resolvers.py +0 -0
  144. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/introspection/__init__.py +0 -0
  145. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/introspection/class_inspect.py +0 -0
  146. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/introspection/function_inspect.py +0 -0
  147. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/logging/__init__.py +0 -0
  148. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/logging/configuration.py +0 -0
  149. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/logging/sanitize.py +0 -0
  150. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/message_brokers/__init__.py +0 -0
  151. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -0
  152. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -0
  153. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -0
  154. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -0
  155. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/metadata/__init__.py +0 -0
  156. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/metadata/aggregators.py +0 -0
  157. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/multi_processing/__init__.py +0 -0
  158. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -0
  159. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/nim/__init__.py +0 -0
  160. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/pdf/__init__.py +0 -0
  161. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/pdf/pdfium.py +0 -0
  162. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/schema/__init__.py +0 -0
  163. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/schema/schema_validator.py +0 -0
  164. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/service_clients/__init__.py +0 -0
  165. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/service_clients/client_base.py +0 -0
  166. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/service_clients/kafka/__init__.py +0 -0
  167. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  168. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/service_clients/redis/redis_client.py +0 -0
  169. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  170. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/service_clients/rest/rest_client.py +0 -0
  171. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/string_processing/__init__.py +0 -0
  172. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/string_processing/configuration.py +0 -0
  173. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/string_processing/yaml.py +0 -0
  174. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/system/__init__.py +0 -0
  175. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/system/hardware_info.py +0 -0
  176. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api.egg-info/dependency_links.txt +0 -0
  177. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api.egg-info/top_level.txt +0 -0
  178. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/udfs/__init__.py +0 -0
  179. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/udfs/llm_summarizer_udf.py +0 -0
  180. {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/version.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.9.23.dev20250923
3
+ Version: 2025.9.26.dev20250926
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -214,9 +214,13 @@ Classifier: Operating System :: OS Independent
214
214
  Description-Content-Type: text/markdown
215
215
  License-File: LICENSE
216
216
  Requires-Dist: backoff==2.2.1
217
+ Requires-Dist: moviepy==2.2.1
217
218
  Requires-Dist: pandas>=2.0
218
219
  Requires-Dist: pydantic>2.0.0
219
220
  Requires-Dist: pydantic-settings>2.0.0
221
+ Requires-Dist: fsspec>=2025.5.1
222
+ Requires-Dist: universal_pathlib>=0.2.6
223
+ Requires-Dist: ffmpeg-python==0.2.0
220
224
  Requires-Dist: tritonclient
221
225
  Dynamic: license-file
222
226
 
@@ -21,9 +21,13 @@ classifiers = [
21
21
  ]
22
22
  dependencies = [
23
23
  "backoff==2.2.1",
24
+ "moviepy==2.2.1",
24
25
  "pandas>=2.0",
25
26
  "pydantic>2.0.0",
26
27
  "pydantic-settings>2.0.0",
28
+ "fsspec>=2025.5.1",
29
+ "universal_pathlib>=0.2.6",
30
+ "ffmpeg-python==0.2.0",
27
31
  "tritonclient",
28
32
  ]
29
33
 
@@ -227,11 +227,7 @@ def convert_to_mono_wav(audio_bytes):
227
227
  """
228
228
 
229
229
  if librosa is None:
230
- raise ImportError(
231
- "Librosa is required for audio processing. "
232
- "If you are running this code with the ingest container, it can be installed by setting "
233
- "the environment variable. INSTALL_AUDIO_EXTRACTION_DEPS=true"
234
- )
230
+ raise ImportError("Librosa is required for audio processing. ")
235
231
 
236
232
  # Create a BytesIO object from the audio bytes
237
233
  byte_io = io.BytesIO(audio_bytes)
@@ -0,0 +1,9 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ # Copyright (c) 2025, NVIDIA CORPORATION.
6
+
7
+ from nv_ingest_api.util.dataloader.dataloader import DataLoader, MediaInterface
8
+
9
+ __all__ = ["DataLoader", "MediaInterface"]
@@ -0,0 +1,371 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ # Copyright (c) 2025, NVIDIA CORPORATION.
6
+ from pathlib import Path
7
+ from abc import ABC, abstractmethod
8
+ import queue
9
+ import threading
10
+ import subprocess
11
+ import json
12
+ import logging
13
+ import math
14
+ import importlib.util
15
+ from enum import Enum
16
+ from concurrent.futures import ThreadPoolExecutor
17
+ from tqdm import tqdm
18
+ import os
19
+ import glob
20
+
21
logger = logging.getLogger(__name__)

# Optional-dependency guard: `ffmpeg` is bound to the ffmpeg-python module when
# both the Python package and the `ffmpeg` executable are usable, otherwise to
# None so the rest of the module can disable itself.
try:
    # find_spec() returns None (it does not raise) when the package is absent,
    # so the result has to be checked explicitly.
    if importlib.util.find_spec("ffmpeg") is None:
        raise ImportError("the ffmpeg-python package is not installed")
    # Raises FileNotFoundError when the executable is not on PATH; check=True
    # additionally rejects an executable that exits with a non-zero status.
    subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
    # Imported inside the try so that a failing import is reported through the
    # same path instead of escaping as an unhandled exception at import time.
    import ffmpeg
except Exception:
    logger.error(
        "Unable to load the Dataloader, ffmpeg was not installed, "
        "please install it using `pip install ffmpeg-python` and `apt-get install ffmpeg`"
    )
    ffmpeg = None
34
+
35
+ if not ffmpeg:
36
+ DataLoader = None
37
+ MediaInterface = None
38
+ else:
39
+
40
class SplitType(Enum):
    """Unit in which a split interval is expressed."""

    # Members are declared in the same order as before; iteration order of an
    # Enum follows declaration order.
    FRAME = "frame"
    TIME = "time"
    SIZE = "size"
44
+
45
class LoaderInterface(ABC):
    """Abstract base class for loaders.

    Concrete subclasses must implement both ``split`` and
    ``_get_path_metadata``; the class cannot be instantiated otherwise.
    """

    @abstractmethod
    def split(self, input_path: str, output_dir: str, split_interval: int = 0):
        """Split the input at ``input_path`` into ``output_dir`` (implemented by subclasses)."""

    @abstractmethod
    def _get_path_metadata(self, path: str = None):
        """Return metadata for ``path`` (implemented by subclasses)."""
54
+
55
def _probe(filename, format=None, file_handle=None, timeout=None, **kwargs):
    """Run ``ffprobe`` on a media source and return its parsed JSON report.

    Args:
        filename: Path handed to ffprobe when ``file_handle`` is not given.
        format: Accepted so existing callers can keep passing it; this
            function does not use it.
        file_handle: When truthy, ffprobe reads from ``pipe:`` and this value
            is sent to the process as its stdin payload. The pipes are opened
            in binary mode, so it is expected to be ``bytes`` rather than an
            open file object.
        timeout: Optional number of seconds to wait for ffprobe.
        **kwargs: Extra options converted to ffprobe command-line arguments by
            ``ffmpeg._utils.convert_kwargs_to_cmd_line_args``.

    Returns:
        The decoded JSON document printed by ffprobe (format and streams).

    Raises:
        ffmpeg._run.Error: ffprobe exited with a non-zero status.
        subprocess.TimeoutExpired: ffprobe did not finish within ``timeout``.
    """
    args = ["ffprobe", "-show_format", "-show_streams", "-of", "json"]
    args += ffmpeg._utils.convert_kwargs_to_cmd_line_args(kwargs)
    args.append("pipe:" if file_handle else filename)

    communicate_kwargs = {}
    if timeout is not None:
        communicate_kwargs["timeout"] = timeout
    if file_handle:
        communicate_kwargs["input"] = file_handle

    p = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    try:
        out, err = p.communicate(**communicate_kwargs)
    except subprocess.TimeoutExpired:
        # communicate() does not stop the child when the timeout expires; kill
        # it and drain the pipes so no ffprobe process is left behind.
        p.kill()
        p.communicate()
        raise

    if p.returncode != 0:
        raise ffmpeg._run.Error("ffprobe", out, err)
    return json.loads(out.decode("utf-8"))
72
+
73
def _get_audio_from_video(input_path: str, output_file: str, cache_path: str = None):
    """Extract the audio of a video file into ``output_file``.

    The audio streams of the input (``map="0:a"``) are encoded with
    ``libmp3lame``; an existing output file is overwritten. The parent
    directory of ``output_file`` is created when missing.

    Args:
        input_path: Path to the video file.
        output_file: Path of the audio file to write.
        cache_path: Accepted for interface compatibility; not used here.

    Returns:
        ``pathlib.Path`` of the written audio file, or ``None`` when ffmpeg
        reports an error (the error is logged).
    """
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    try:
        (
            ffmpeg.input(str(input_path))
            .output(str(output_path), acodec="libmp3lame", map="0:a")
            .overwrite_output()
            .run(capture_stdout=True, capture_stderr=True)
        )
    except ffmpeg.Error as e:
        # Decode defensively: a missing or non-UTF-8 stderr must not raise a
        # second exception while the first one is being reported.
        stderr_text = e.stderr.decode(errors="replace") if e.stderr else ""
        logger.error(f"FFmpeg error for file {input_path}: {stderr_text}")
        return None
    return output_path
94
+
95
def strip_audio_from_video_files(input_path: str, output_dir: str, cache_path: str = None):
    """Extract the audio of one video file, or of every ``*.mp4`` in a directory.

    Each input ``<name>.mp4`` is written to ``<output_dir>/<name>.mp3`` by
    ``_get_audio_from_video``; the extractions run on a thread pool.

    Args:
        input_path: Path to a single video file, or to a directory whose
            ``*.mp4`` files are processed.
        output_dir: Directory receiving the audio files; created when missing.
        cache_path: Accepted for interface compatibility; not used here.

    Returns:
        List of string paths of the audio files that were written. Inputs
        whose extraction failed are omitted, so the list never contains the
        literal string ``"None"``.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    path = Path(input_path)
    if path.is_file():
        files = [path]
    else:
        # Sorted so the processing and result order is deterministic.
        files = sorted(Path(found) for found in glob.glob(os.path.join(path, "*.mp4")))

    with ThreadPoolExecutor(max_workers=15) as executor:
        futures = [executor.submit(_get_audio_from_video, file, output_path / f"{file.stem}.mp3") for file in files]
        extracted = [future.result() for future in tqdm(futures)]

    # _get_audio_from_video returns None on failure; keep only real paths.
    return [str(audio_path) for audio_path in extracted if audio_path is not None]
113
+
114
class MediaInterface(LoaderInterface):
    """
    LoaderInterface implementation that probes audio/video files and splits them
    into chunks with FFmpeg.
    """

    def __init__(self):
        # Maps each input path passed to split() to the probe result of that file.
        self.path_metadata = {}

    @staticmethod
    def _parse_frame_rate(rate: str) -> float:
        """
        Convert a frame rate written as "num/den" (e.g. "30000/1001") or as a plain
        number into frames per second. Raises ValueError when the value is malformed
        or has a zero denominator.
        """
        numerator, _, denominator = str(rate).partition("/")
        num = float(numerator)
        den = float(denominator) if denominator else 1.0
        if den == 0:
            raise ValueError(f"Invalid frame rate: {rate}")
        return num / den

    def probe_media(self, path_file: Path, split_interval: int, split_type: SplitType, file_handle=None):
        """
        Probe a media file and work out how many chunks it should be split into.
        path_file: Path, path to the media file
        split_interval: int, size of each chunk, interpreted according to split_type
        split_type: SplitType, type of split to perform, either size, time, or frame
        file_handle: optional handle forwarded to the probe, which then reads from "pipe:"
        returns: tuple (probe, num_splits, duration); num_splits is None when probing or
            the split computation failed (the failure is logged)
        """
        num_splits = None
        duration = None
        probe = None
        sample_rate = None
        try:
            file_size = path_file.stat().st_size  # in bytes
            if file_handle:
                probe = _probe("pipe:", format=path_file.suffix, file_handle=file_handle)
            else:
                probe = _probe(str(path_file), format=path_file.suffix)
            # Only the first stream decides whether the file is treated as video or audio.
            stream = probe["streams"][0]
            if stream["codec_type"] == "video":
                # avg_frame_rate is a rational; using only its numerator would report
                # 30000 fps for "30000/1001".
                sample_rate = self._parse_frame_rate(stream["avg_frame_rate"])
                duration = float(probe["format"]["duration"])
            elif stream["codec_type"] == "audio":
                sample_rate = float(stream["sample_rate"])
                bitrate = probe["format"]["bit_rate"]
                duration = (file_size * 8) / float(bitrate)
            num_splits = self.find_num_splits(file_size, sample_rate, duration, split_interval, split_type)
        except ffmpeg.Error as e:
            stderr = e.stderr.decode(errors="replace") if e.stderr else ""
            logging.error(f"FFmpeg error for file {path_file}: {stderr}")
        except ValueError as e:
            logging.error(f"Error finding number of splits for file {path_file}: {e}")
        return probe, num_splits, duration

    def get_audio_from_video(self, input_path: str, output_file: str, cache_path: str = None):
        """
        Extract the audio of a video file; thin wrapper around _get_audio_from_video.
        returns: whatever _get_audio_from_video returns for the same arguments
        """
        return _get_audio_from_video(input_path, output_file, cache_path)

    def split(
        self,
        input_path: str,
        output_dir: str,
        split_interval: int = 0,
        split_type: SplitType = SplitType.SIZE,
        cache_path: str = None,
        video_audio_separate: bool = False,
    ):
        """
        Split a media file into smaller chunks of `split_interval` size. If
        video_audio_separate is True and the file is a video, the audio of every
        chunk is extracted and saved to a separate file.
        input_path: str, path to the media file
        output_dir: str, path to the output directory (created when missing)
        split_interval: the size of the chunk to split the media file into depending on the split type
        split_type: SplitType, type of split to perform, either size, time, or frame
        cache_path: str, forwarded to the audio extraction; defaults to output_dir
        video_audio_separate: bool, whether to separate the video and audio files
        returns: list of chunk paths, or a list of (video_chunk, audio_file) tuples when
            audio is separated. Only chunks that exist on disk are returned; the list is
            empty when the file could not be probed or the split could not be computed.
        """
        import ffmpeg

        path_file = Path(input_path)
        file_name = path_file.stem
        suffix = path_file.suffix
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        output_pattern = output_dir / f"{file_name}_chunk_%04d{suffix}"
        cache_path = cache_path if cache_path else output_dir

        probe, num_splits, duration = self.probe_media(path_file, split_interval, split_type)
        if not num_splits or not duration:
            # probe_media already logged the root cause; without both values the
            # segment length cannot be computed.
            logging.error(f"Cannot split {input_path}: number of splits or duration is unknown")
            return []

        try:
            segment_time = math.ceil(duration / num_splits)
            output_kwargs = {
                "f": "segment",
                "segment_time": segment_time,
                "c": "copy",
                "map": "0",
            }
            if suffix.lower() == ".mp4":
                output_kwargs.update(
                    {
                        "force_key_frames": f"expr:gte(t,n_forced*{segment_time})",
                        "crf": 22,
                        "g": 50,
                        "sc_threshold": 0,
                    }
                )
            capture_output, capture_error = (
                ffmpeg.input(str(input_path))
                .output(str(output_pattern), **output_kwargs)
                .run(capture_stdout=True, capture_stderr=True)
            )
            logging.debug(f"Split {input_path} into {num_splits} chunks")
            self.path_metadata[input_path] = probe
            # FFmpeg output goes to the debug log instead of being printed to stdout.
            logging.debug(capture_output)
            logging.debug(capture_error)
        except ffmpeg.Error as e:
            stderr = e.stderr.decode(errors="replace") if e.stderr else ""
            logging.error(f"FFmpeg error for file {input_path}: {stderr}")

        # num_splits is an estimate: FFmpeg may write fewer segments (or none on
        # failure), so only report chunks that are really there.
        expected = (output_dir / f"{file_name}_chunk_{i:04d}{suffix}" for i in range(int(num_splits)))
        files = [str(chunk) for chunk in expected if chunk.exists()]

        if video_audio_separate and suffix.lower() in (".mp4", ".mov", ".avi", ".mkv"):
            video_audio_files = []
            for file in files:
                chunk_path = Path(file)
                audio_path = self.get_audio_from_video(chunk_path, chunk_path.with_suffix(".mp3"), cache_path)
                if audio_path is not None:
                    # Pair inside the loop so a failed extraction cannot shift the
                    # audio of a later chunk onto an earlier video chunk.
                    video_audio_files.append((file, audio_path))
                else:
                    logging.error(f"Failed to extract audio from {file}")
            return video_audio_files
        return files

    def find_num_splits(
        self,
        file_size: int,
        sample_rate: float,
        duration: float,
        split_interval: int,
        split_type: SplitType,
    ):
        """
        Find the number of splits for a media file based on the split type and interval.
        file_size: int, size of the media file in bytes
        sample_rate: float, sample rate of the media file in samples per second
        duration: float, duration of the media file in seconds
        split_interval: int, size of the chunk to split the media file into depending on the split type
        split_type: SplitType, type of split to perform, either size, time, or frame
        returns: int, number of chunks (rounded up)
        raises: ValueError for an unknown split type, a non-positive split_interval, or a
            missing/non-positive duration or sample rate where the split type needs it
        """
        if split_type not in (SplitType.SIZE, SplitType.TIME, SplitType.FRAME):
            raise ValueError(f"Invalid split type: {split_type}")
        if split_interval is None or split_interval <= 0:
            # Reported as ValueError (not ZeroDivisionError) so probe_media can log it.
            raise ValueError(f"split_interval must be greater than 0, got {split_interval}")
        if split_type == SplitType.SIZE:
            return math.ceil(file_size / split_interval)
        if duration is None:
            raise ValueError("duration is unknown")
        if split_type == SplitType.TIME:
            return math.ceil(duration / split_interval)
        if sample_rate is None or sample_rate <= 0:
            raise ValueError(f"sample_rate must be greater than 0, got {sample_rate}")
        # Longest chunk, in seconds, that holds at most split_interval samples/frames.
        seconds_cap = split_interval / sample_rate
        return math.ceil(duration / seconds_cap)

    def _get_path_metadata(self):
        """
        Get the metadata recorded by split().
        returns: dict mapping each input path to its probe result
        """
        return self.path_metadata
252
+
253
def _stop_requested(thread_stop) -> bool:
    """Return True when `thread_stop` (an Event-like object or a plain flag) asks the loader to stop."""
    is_set = getattr(thread_stop, "is_set", None)
    if callable(is_set):
        # A threading.Event object is always truthy, so it must be queried explicitly.
        return bool(is_set())
    return bool(thread_stop)


def _put_unless_stopped(target, item, thread_stop, poll_interval: float = 0.1) -> bool:
    """
    Put `item` on the queue `target`, polling so that a stop request can interrupt
    a producer that is waiting on a full queue.
    returns: True when the item was queued, False when a stop was requested first
    """
    while not _stop_requested(thread_stop):
        try:
            target.put(item, timeout=poll_interval)
            return True
        except queue.Full:
            continue
    return False


def load_data(queue: queue.Queue, paths: list[str], thread_stop: threading.Event):
    """
    Read every file in `paths` and push its content to `queue`.
    queue: queue.Queue, receives the payloads
    paths: list of file paths, or of (video_path, audio_path) tuples; a tuple is
        queued as a (video_bytes, audio_bytes) tuple, a single path as bytes
    thread_stop: threading.Event (or plain boolean flag); when it signals a stop the
        function returns early without queueing anything else
    On a read error a RuntimeError instance describing it is queued. Unless a stop was
    requested, the StopIteration class is queued last as the end-of-data marker.
    """
    file = None
    try:
        for file in paths:
            # Checked for every entry, tuples included.
            if _stop_requested(thread_stop):
                return
            if isinstance(file, tuple):
                video_file, audio_file = file
                with open(video_file, "rb") as f:
                    video = f.read()
                with open(audio_file, "rb") as f:
                    audio = f.read()
                payload = (video, audio)
            else:
                with open(file, "rb") as f:
                    payload = f.read()
            if not _put_unless_stopped(queue, payload, thread_stop):
                return
    except Exception as e:
        logging.error(f"Error processing file {file}: {e}")
        _put_unless_stopped(queue, RuntimeError(f"Error processing file {file}: {e}"), thread_stop)
    _put_unless_stopped(queue, StopIteration, thread_stop)
273
+
274
class _StopFlag(threading.Event):
    """
    Event whose truth value mirrors is_set(), so the worker can test it either
    with `if flag:` or with `flag.is_set()`.
    """

    def __bool__(self):
        return self.is_set()


class DataLoader:
    """
    DataLoader splits a media file on instantiation and, when iterated, loads the
    resulting chunks on a background thread and hands them out through a bounded queue.
    path: str, path to the media file to split
    output_dir: str, directory the chunks are written to
    split_type: SplitType, type of split to perform
    split_interval: int, size of each chunk, interpreted according to split_type
    interface: LoaderInterface, performs the split; defaults to MediaInterface()
    size: int, size of the queue
    video_audio_separate: bool, forwarded to the interface's split()
    """

    def __init__(
        self,
        path: str,
        output_dir: str,
        split_type: SplitType = SplitType.SIZE,
        split_interval: int = 450,
        interface: LoaderInterface = None,
        size: int = 2,
        video_audio_separate: bool = False,
    ):
        interface = interface if interface else MediaInterface()
        self.thread = None
        self.thread_stop = False
        # Shared with the worker thread; a plain bool would be copied at thread
        # creation and could never signal a stop.
        self._stop_flag = _StopFlag()
        # True while there is nothing to wait for (no worker started, stopped, or done).
        self._exhausted = True
        self.queue = queue.Queue(size)
        self.path = Path(path)
        self.output_dir = output_dir
        self.split_interval = split_interval
        self.interface = interface
        self.files_completed = []
        self.split_type = split_type
        self.video_audio_separate = video_audio_separate
        # process the file immediately on instantiation
        self._split()

    def _split(self):
        """Run the interface's split() and remember the produced files."""
        self.files_completed = self.interface.split(
            self.path,
            self.output_dir,
            split_interval=self.split_interval,
            split_type=self.split_type,
            video_audio_separate=self.video_audio_separate,
        )

    def _drain_queue(self):
        """Discard everything currently queued; each removal also wakes a producer waiting for space."""
        pending = getattr(self, "queue", None)
        if pending is None:
            return
        while True:
            try:
                pending.get_nowait()
            except queue.Empty:
                return

    def __next__(self):
        """
        Return the next payload produced by the worker thread.
        A RuntimeError instance queued by the worker is returned as a payload, not raised.
        raises: StopIteration at the end of the data, and on every later call until
            iteration is restarted with iter()
        """
        if self._exhausted:
            # Nothing will ever arrive; blocking on the queue would hang forever.
            raise StopIteration
        payload = self.queue.get()
        if payload is StopIteration:
            self._exhausted = True
            raise StopIteration
        return payload

    def stop(self):
        """
        Reset the iterator by stopping the thread and clearing the queue.
        """
        thread = getattr(self, "thread", None)
        if thread is not None:
            self.thread_stop = True
            self._stop_flag.set()
            # Keep draining while waiting: a worker blocked on a full queue can
            # only notice the stop request once it gets room to finish its put.
            while thread.is_alive():
                self._drain_queue()
                thread.join(timeout=0.05)
        self.thread_stop = False
        self._exhausted = True
        self._drain_queue()

    def __iter__(self):
        """Stop any running worker, start a fresh one over all files and return self."""
        self.stop()
        # A new flag per worker, so a previous stop request cannot leak into this run.
        self._stop_flag = _StopFlag()
        self.thread_stop = False
        self._exhausted = False
        self.thread = threading.Thread(
            target=load_data,
            args=(
                self.queue,
                self.files_completed,
                self._stop_flag,
            ),
            daemon=True,
        )
        self.thread.start()
        return self

    def __len__(self):
        """Number of files produced by the split."""
        return len(self.files_completed)

    def __getitem__(self, index):
        """
        Read the file(s) at `index` directly, bypassing the queue.
        returns: bytes for a single file, or a (video_bytes, audio_bytes) tuple when
            the entry is a (video, audio) pair
        raises: whatever the lookup or the read raises, after logging it
        """
        try:
            entry = self.files_completed[index]
            if isinstance(entry, tuple):
                video_file, audio_file = entry
                with open(video_file, "rb") as f:
                    video = f.read()
                with open(audio_file, "rb") as f:
                    audio = f.read()
                return video, audio
            with open(entry, "rb") as f:
                return f.read()
        except Exception as e:
            logging.error(f"Error getting item {index}: {e}")
            raise

    def __del__(self):
        try:
            self.stop()
        except Exception:
            # Finalizers may run on a partially built object or during interpreter
            # shutdown; there is nothing useful to do with an error here.
            pass

    def __enter__(self):
        """Allow `with DataLoader(...) as loader:`; the worker is stopped on exit."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.stop()

    def get_metadata(self):
        """
        Get the metadata collected by the interface.
        returns: whatever the interface's _get_path_metadata() returns
        """

        return self.interface._get_path_metadata()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.9.23.dev20250923
3
+ Version: 2025.9.26.dev20250926
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -214,9 +214,13 @@ Classifier: Operating System :: OS Independent
214
214
  Description-Content-Type: text/markdown
215
215
  License-File: LICENSE
216
216
  Requires-Dist: backoff==2.2.1
217
+ Requires-Dist: moviepy==2.2.1
217
218
  Requires-Dist: pandas>=2.0
218
219
  Requires-Dist: pydantic>2.0.0
219
220
  Requires-Dist: pydantic-settings>2.0.0
221
+ Requires-Dist: fsspec>=2025.5.1
222
+ Requires-Dist: universal_pathlib>=0.2.6
223
+ Requires-Dist: ffmpeg-python==0.2.0
220
224
  Requires-Dist: tritonclient
221
225
  Dynamic: license-file
222
226
 
@@ -124,6 +124,8 @@ src/nv_ingest_api/util/converters/datetools.py
124
124
  src/nv_ingest_api/util/converters/dftools.py
125
125
  src/nv_ingest_api/util/converters/formats.py
126
126
  src/nv_ingest_api/util/converters/type_mappings.py
127
+ src/nv_ingest_api/util/dataloader/__init__.py
128
+ src/nv_ingest_api/util/dataloader/dataloader.py
127
129
  src/nv_ingest_api/util/detectors/__init__.py
128
130
  src/nv_ingest_api/util/detectors/language.py
129
131
  src/nv_ingest_api/util/exception_handlers/__init__.py
@@ -1,5 +1,9 @@
1
1
  backoff==2.2.1
2
+ moviepy==2.2.1
2
3
  pandas>=2.0
3
4
  pydantic>2.0.0
4
5
  pydantic-settings>2.0.0
6
+ fsspec>=2025.5.1
7
+ universal_pathlib>=0.2.6
8
+ ffmpeg-python==0.2.0
5
9
  tritonclient