nv-ingest-api 2025.9.23.dev20250923__tar.gz → 2025.9.26.dev20250926__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- {nv_ingest_api-2025.9.23.dev20250923/src/nv_ingest_api.egg-info → nv_ingest_api-2025.9.26.dev20250926}/PKG-INFO +5 -1
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/pyproject.toml +4 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +1 -5
- nv_ingest_api-2025.9.26.dev20250926/src/nv_ingest_api/util/dataloader/__init__.py +9 -0
- nv_ingest_api-2025.9.26.dev20250926/src/nv_ingest_api/util/dataloader/dataloader.py +371 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926/src/nv_ingest_api.egg-info}/PKG-INFO +5 -1
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api.egg-info/SOURCES.txt +2 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api.egg-info/requires.txt +4 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/LICENSE +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/MANIFEST.in +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/README.md +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/setup.cfg +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/interface/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/interface/extract.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/interface/mutate.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/interface/store.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/interface/transform.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/interface/utility.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/enums/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/enums/common.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/audio/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/docx/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/html/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/html/html_extractor.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/image/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/image/chart_extractor.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/image/image_extractor.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/image/table_extractor.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pdf/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pptx/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/meta/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/meta/udf.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/mutate/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/mutate/deduplicate.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/mutate/filter.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/control_message_task.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/ingest_control_message.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/default_values.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/nim_client.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/tracing/latency.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/tracing/logging.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/primitives/tracing/tagging.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/extract/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/extract/extract_html_schema.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/meta/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/meta/udf.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/mutate/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/store/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/transform/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/store/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/store/embed_text_upload.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/store/image_upload.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/transform/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/transform/caption_image.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/transform/embed_text.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/internal/transform/split_text.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/control_message/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/control_message/validators.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/converters/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/converters/bytetools.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/converters/containers.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/converters/datetools.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/converters/dftools.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/converters/formats.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/converters/type_mappings.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/detectors/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/detectors/language.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/exception_handlers/converters.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/exception_handlers/decorators.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/exception_handlers/detectors.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/exception_handlers/pdf.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/exception_handlers/schemas.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/image_processing/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/image_processing/clustering.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/image_processing/processing.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/image_processing/table_and_chart.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/image_processing/transforms.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/imports/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/imports/callable_signatures.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/imports/dynamic_resolvers.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/introspection/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/introspection/class_inspect.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/introspection/function_inspect.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/logging/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/logging/configuration.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/logging/sanitize.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/message_brokers/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/metadata/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/metadata/aggregators.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/multi_processing/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/nim/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/pdf/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/pdf/pdfium.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/schema/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/schema/schema_validator.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/service_clients/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/service_clients/client_base.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/service_clients/kafka/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/service_clients/redis/redis_client.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/service_clients/rest/rest_client.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/string_processing/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/string_processing/configuration.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/string_processing/yaml.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/system/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api/util/system/hardware_info.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api.egg-info/dependency_links.txt +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/nv_ingest_api.egg-info/top_level.txt +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/udfs/__init__.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/udfs/llm_summarizer_udf.py +0 -0
- {nv_ingest_api-2025.9.23.dev20250923 → nv_ingest_api-2025.9.26.dev20250926}/src/version.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: nv-ingest-api
|
|
3
|
-
Version: 2025.9.23.dev20250923
|
|
3
|
+
Version: 2025.9.26.dev20250926
|
|
4
4
|
Summary: Python module with core document ingestion functions.
|
|
5
5
|
Author-email: Jeremy Dyer <jdyer@nvidia.com>
|
|
6
6
|
License: Apache License
|
|
@@ -214,9 +214,13 @@ Classifier: Operating System :: OS Independent
|
|
|
214
214
|
Description-Content-Type: text/markdown
|
|
215
215
|
License-File: LICENSE
|
|
216
216
|
Requires-Dist: backoff==2.2.1
|
|
217
|
+
Requires-Dist: moviepy==2.2.1
|
|
217
218
|
Requires-Dist: pandas>=2.0
|
|
218
219
|
Requires-Dist: pydantic>2.0.0
|
|
219
220
|
Requires-Dist: pydantic-settings>2.0.0
|
|
221
|
+
Requires-Dist: fsspec>=2025.5.1
|
|
222
|
+
Requires-Dist: universal_pathlib>=0.2.6
|
|
223
|
+
Requires-Dist: ffmpeg-python==0.2.0
|
|
220
224
|
Requires-Dist: tritonclient
|
|
221
225
|
Dynamic: license-file
|
|
222
226
|
|
|
@@ -21,9 +21,13 @@ classifiers = [
|
|
|
21
21
|
]
|
|
22
22
|
dependencies = [
|
|
23
23
|
"backoff==2.2.1",
|
|
24
|
+
"moviepy==2.2.1",
|
|
24
25
|
"pandas>=2.0",
|
|
25
26
|
"pydantic>2.0.0",
|
|
26
27
|
"pydantic-settings>2.0.0",
|
|
28
|
+
"fsspec>=2025.5.1",
|
|
29
|
+
"universal_pathlib>=0.2.6",
|
|
30
|
+
"ffmpeg-python==0.2.0",
|
|
27
31
|
"tritonclient",
|
|
28
32
|
]
|
|
29
33
|
|
|
@@ -227,11 +227,7 @@ def convert_to_mono_wav(audio_bytes):
|
|
|
227
227
|
"""
|
|
228
228
|
|
|
229
229
|
if librosa is None:
|
|
230
|
-
raise ImportError(
|
|
231
|
-
"Librosa is required for audio processing. "
|
|
232
|
-
"If you are running this code with the ingest container, it can be installed by setting "
|
|
233
|
-
"the environment variable. INSTALL_AUDIO_EXTRACTION_DEPS=true"
|
|
234
|
-
)
|
|
230
|
+
raise ImportError("Librosa is required for audio processing. ")
|
|
235
231
|
|
|
236
232
|
# Create a BytesIO object from the audio bytes
|
|
237
233
|
byte_io = io.BytesIO(audio_bytes)
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
# Copyright (c) 2025, NVIDIA CORPORATION.
|
|
6
|
+
|
|
7
|
+
from nv_ingest_api.util.dataloader.dataloader import DataLoader, MediaInterface
|
|
8
|
+
|
|
9
|
+
__all__ = ["DataLoader", "MediaInterface"]
|
|
@@ -0,0 +1,371 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
# Copyright (c) 2025, NVIDIA CORPORATION.
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
import queue
|
|
9
|
+
import threading
|
|
10
|
+
import subprocess
|
|
11
|
+
import json
|
|
12
|
+
import logging
|
|
13
|
+
import math
|
|
14
|
+
import importlib.util
|
|
15
|
+
from enum import Enum
|
|
16
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
17
|
+
from tqdm import tqdm
|
|
18
|
+
import os
|
|
19
|
+
import glob
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)

# Probe for both halves of the ffmpeg dependency before defining anything that
# needs them: the `ffmpeg` Python package (ffmpeg-python) and the `ffmpeg`
# executable on PATH. On any failure `ffmpeg` is bound to None so the rest of
# the module can check for it.
try:
    # find_spec() returns None (it does not raise) when a top-level module is
    # missing, so the result must be checked explicitly; otherwise a missing
    # package would only surface as an uncaught ImportError in the `else` branch.
    if importlib.util.find_spec("ffmpeg") is None:
        raise ImportError("The `ffmpeg` Python package (ffmpeg-python) is not installed")
    # Raises FileNotFoundError when the ffmpeg executable is not on PATH.
    subprocess.run(["ffmpeg", "-version"], capture_output=True)
except Exception:
    logger.error(
        "Unable to load the Dataloader, ffmpeg was not installed, "
        "please install it using `pip install ffmpeg-python` and `apt-get install ffmpeg`"
    )
    ffmpeg = None
else:
    import ffmpeg
|
|
34
|
+
|
|
35
|
+
if not ffmpeg:
|
|
36
|
+
DataLoader = None
|
|
37
|
+
MediaInterface = None
|
|
38
|
+
else:
|
|
39
|
+
|
|
40
|
+
class SplitType(Enum):
    """
    Selector passed as `split_type` to the media splitting/probing methods.

    NOTE(review): the unit each member implies for `split_interval` is not
    defined here — confirm against `find_num_splits`.
    """

    FRAME = "frame"
    TIME = "time"
    SIZE = "size"
|
|
44
|
+
|
|
45
|
+
class LoaderInterface(ABC):
    """Abstract base class for loaders; subclasses must implement `split` and `_get_path_metadata`."""

    @abstractmethod
    def split(self, input_path: str, output_dir: str, split_interval: int = 0):
        """
        Abstract: split the input at `input_path` into pieces under `output_dir`.

        NOTE(review): the meaning/unit of `split_interval` and the return value
        are defined by the implementing class, not here.
        """
        pass

    @abstractmethod
    def _get_path_metadata(self, path: str = None):
        """
        Abstract hook with no contract stated here.

        NOTE(review): presumably returns metadata for `path` — confirm against
        the implementing class.
        """
        pass
|
|
54
|
+
|
|
55
|
+
def _probe(filename, format=None, file_handle=None, timeout=None, **kwargs):
    """
    Run `ffprobe` on a media source and return its JSON report as a dict.

    filename: path (or URL) handed to ffprobe; ignored when `file_handle` is given
    format: accepted for call compatibility but currently not passed to ffprobe
    file_handle: when truthy, ffprobe reads from stdin ("pipe:") and this value is
        sent as the process input (so it must be bytes, not an open file object)
    timeout: optional number of seconds to wait for ffprobe to finish
    kwargs: extra options converted to ffprobe command-line arguments

    Returns the parsed output of `ffprobe -show_format -show_streams -of json`.

    Raises ffmpeg.Error when ffprobe exits with a non-zero status, and
    subprocess.TimeoutExpired when `timeout` elapses.
    """
    args = ["ffprobe", "-show_format", "-show_streams", "-of", "json"]
    args += ffmpeg._utils.convert_kwargs_to_cmd_line_args(kwargs)
    args.append("pipe:" if file_handle else filename)

    communicate_kwargs = {}
    if timeout is not None:
        communicate_kwargs["timeout"] = timeout
    if file_handle:
        communicate_kwargs["input"] = file_handle

    p = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    try:
        out, err = p.communicate(**communicate_kwargs)
    except subprocess.TimeoutExpired:
        # communicate() does not stop the child on timeout; kill and reap it so
        # the ffprobe process and its pipes are not leaked, then re-raise.
        p.kill()
        p.communicate()
        raise
    if p.returncode != 0:
        raise ffmpeg._run.Error("ffprobe", out, err)
    return json.loads(out.decode("utf-8"))
|
|
72
|
+
|
|
73
|
+
def _get_audio_from_video(input_path: str, output_file: str, cache_path: str = None):
    """
    Extract the audio of a video file into `output_file`, encoded with libmp3lame.

    input_path: str, path to the video file
    output_file: str, path of the audio file to write; its parent directory is
        created if it does not exist, and an existing file is overwritten
    cache_path: str, accepted for call compatibility but currently unused

    Returns the output file as a `pathlib.Path`, or None if ffmpeg reports an error
    (the error output is logged).
    """
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    try:
        (
            ffmpeg.input(str(input_path))
            .output(str(output_path), acodec="libmp3lame", map="0:a")
            .overwrite_output()
            .run(capture_stdout=True, capture_stderr=True)
        )
    except ffmpeg.Error as e:
        # Log through this module's logger (same object as the module-level
        # `logger`) rather than the root logger, and never let a missing or
        # undecodable stderr mask the original ffmpeg failure.
        stderr = e.stderr.decode(errors="replace") if e.stderr else ""
        logging.getLogger(__name__).error(f"FFmpeg error for file {input_path}: {stderr}")
        return None
    return output_path
|
|
94
|
+
|
|
95
|
+
def strip_audio_from_video_files(input_path: str, output_dir: str, cache_path: str = None):
    """
    Extract the audio from one video file, or from every `*.mp4` file in a directory.

    input_path: str, path to a video file or to a directory containing `.mp4` files
    output_dir: str, directory that receives one `<stem>.mp3` per input video
        (created if missing)
    cache_path: str, accepted for call compatibility but currently unused

    Returns a list of str paths to the audio files that were written. Inputs whose
    extraction failed are omitted (previously they appeared as the string "None").
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    path = Path(input_path)
    if path.is_file():
        files = [path]
    else:
        # Sorted so the result order does not depend on directory listing order.
        files = [Path(file) for file in sorted(glob.glob(os.path.join(path, "*.mp4")))]

    with ThreadPoolExecutor(max_workers=15) as executor:
        futures = [executor.submit(_get_audio_from_video, file, output_path / f"{file.stem}.mp3") for file in files]
        extracted = [future.result() for future in tqdm(futures)]

    # _get_audio_from_video returns None on failure; do not report those as paths.
    return [str(audio_path) for audio_path in extracted if audio_path is not None]
|
|
113
|
+
|
|
114
|
+
class MediaInterface(LoaderInterface):
    """
    Loader interface for audio/video files: probes a media file and splits it
    into chunks with FFmpeg. Probe results of the inputs that were split are
    kept in `path_metadata`, keyed by the input path given to `split`.
    """

    def __init__(self):
        # Maps each split input path to the probe result returned by `_probe`.
        self.path_metadata = {}

    @staticmethod
    def _parse_frame_rate(rate) -> float:
        """
        Convert a frame rate given as a rational string ("30000/1001") or a plain
        number ("25") to frames per second. A zero denominator (e.g. "0/0")
        yields 0.0, which `find_num_splits` rejects for frame based splits.
        """
        numerator, _, denominator = str(rate).partition("/")
        if not denominator:
            return float(numerator)
        divisor = float(denominator)
        if divisor == 0:
            return 0.0
        return float(numerator) / divisor

    @staticmethod
    def _stderr_text(error) -> str:
        """Return the stderr attached to an FFmpeg error as text without ever raising."""
        stderr = getattr(error, "stderr", None)
        return stderr.decode(errors="replace") if stderr else ""

    def probe_media(self, path_file: Path, split_interval: int, split_type: SplitType, file_handle=None):
        """
        Probe a media file and work out how many chunks it should be split into.
        path_file: Path, path to the media file
        split_interval: int, size of a chunk, interpreted according to split_type
        split_type: SplitType, type of split to perform, either size, time, or frame
        file_handle: optional handle; when given, the media is probed through "pipe:" instead of by path
        Returns a (probe, num_splits, duration) tuple. FFmpeg errors, malformed probe
        output and invalid split settings are logged; values that could not be
        computed are returned as None.
        """
        num_splits = None
        duration = None
        probe = None
        sample_rate = None
        try:
            file_size = path_file.stat().st_size  # in bytes
            if file_handle:
                probe = _probe("pipe:", format=path_file.suffix, file_handle=file_handle)
            else:
                probe = _probe(str(path_file), format=path_file.suffix)
            # Only the first stream decides whether the file is treated as video or audio.
            stream = probe["streams"][0]
            if stream["codec_type"] == "video":
                # avg_frame_rate is a rational; dividing it out avoids reading
                # "30000/1001" as 30000 frames per second.
                sample_rate = self._parse_frame_rate(stream["avg_frame_rate"])
                duration = float(probe["format"]["duration"])
            elif stream["codec_type"] == "audio":
                sample_rate = float(stream["sample_rate"])
                bitrate = probe["format"]["bit_rate"]
                # Duration is estimated from the file size and the overall bit rate.
                duration = (file_size * 8) / float(bitrate)
            num_splits = self.find_num_splits(file_size, sample_rate, duration, split_interval, split_type)
        except ffmpeg.Error as e:
            logging.error(f"FFmpeg error for file {path_file}: {self._stderr_text(e)}")
        except (KeyError, IndexError, ValueError, ZeroDivisionError) as e:
            logging.error(f"Error finding number of splits for file {path_file}: {e}")
        return probe, num_splits, duration

    def get_audio_from_video(self, input_path: str, output_file: str, cache_path: str = None):
        """
        Extract the audio of a video file into output_file.
        Thin wrapper around the module level `_get_audio_from_video`; returns its result.
        """
        return _get_audio_from_video(input_path, output_file, cache_path)

    def split(
        self,
        input_path: str,
        output_dir: str,
        split_interval: int = 0,
        split_type: SplitType = SplitType.SIZE,
        cache_path: str = None,
        video_audio_separate: bool = False,
    ):
        """
        Split a media file into smaller chunks of `split_interval` size. If
        video_audio_separate is True and the file is a video, the audio of every
        chunk is extracted and saved to a separate .mp3 file next to the chunk.
        input_path: str, path to the media file
        output_dir: str, path to the output directory (created if missing)
        split_interval: the size of the chunk to split the media file into depending on the split type
        split_type: SplitType, type of split to perform, either size, time, or frame
        cache_path: str, path to the cache directory; defaults to output_dir
        video_audio_separate: bool, whether to separate the video and audio files
        Returns the list of chunk paths that exist after the split, or a list of
        (video_chunk, audio_file) tuples when video_audio_separate applies. Chunks
        whose audio could not be extracted are logged and left out of the pairs.
        An empty list is returned when the file could not be probed.
        """
        import ffmpeg

        path_file = Path(input_path)
        file_name = path_file.stem
        suffix = path_file.suffix
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        output_pattern = output_dir / f"{file_name}_chunk_%04d{suffix}"
        cache_path = cache_path if cache_path else output_dir

        probe, num_splits, duration = self.probe_media(path_file, split_interval, split_type)
        if not num_splits or not duration:
            # probe_media already logged the cause; without these values there is
            # no segment length to hand to FFmpeg.
            logging.error(f"Unable to determine how to split {input_path}; no chunks were created")
            return []

        segment_time = math.ceil(duration / num_splits)
        output_kwargs = {
            "f": "segment",
            "segment_time": segment_time,
            "c": "copy",
            "map": "0",
        }
        if suffix == ".mp4":
            output_kwargs.update(
                {
                    "force_key_frames": f"expr:gte(t,n_forced*{segment_time})",
                    "crf": 22,
                    "g": 50,
                    "sc_threshold": 0,
                }
            )
        try:
            capture_output, capture_error = (
                ffmpeg.input(str(input_path))
                .output(str(output_pattern), **output_kwargs)
                .run(capture_stdout=True, capture_stderr=True)
            )
            logging.debug(f"Split {input_path} into {num_splits} chunks")
            self.path_metadata[input_path] = probe
            logging.debug(f"FFmpeg stdout for {input_path}: {capture_output}")
            logging.debug(f"FFmpeg stderr for {input_path}: {capture_error}")
        except ffmpeg.Error as e:
            logging.error(f"FFmpeg error for file {input_path}: {self._stderr_text(e)}")

        # segment_time is rounded up, so FFmpeg may write fewer chunks than
        # num_splits (or none on error); only report chunks that are on disk.
        expected = [output_dir / f"{file_name}_chunk_{i:04d}{suffix}" for i in range(int(num_splits))]
        files = [str(chunk) for chunk in expected if chunk.exists()]

        if video_audio_separate and suffix in [".mp4", ".mov", ".avi", ".mkv"]:
            video_audio_files = []
            for file in files:
                chunk = Path(file)
                audio_path = self.get_audio_from_video(chunk, chunk.with_suffix(".mp3"), cache_path)
                if audio_path is not None:
                    # Pair each chunk with its own audio so one failure cannot
                    # shift the audio of later chunks onto the wrong video.
                    video_audio_files.append((file, audio_path))
                else:
                    logging.error(f"Failed to extract audio from {file}")
            return video_audio_files
        return files

    def find_num_splits(
        self,
        file_size: int,
        sample_rate: float,
        duration: float,
        split_interval: int,
        split_type: SplitType,
    ):
        """
        Find the number of splits for a media file based on the split type and interval.
        file_size: int, size of the media file in bytes
        sample_rate: float, sample rate of the media file in samples per second
        duration: float, duration of the media file in seconds
        split_interval: int, size of the chunk to split the media file into depending on the split type
        split_type: SplitType, type of split to perform, either size, time, or frame
        Raises ValueError for an unknown split type, a non-positive split interval,
        or when the duration / sample rate needed by the split type is unavailable.
        """
        if split_type not in (SplitType.SIZE, SplitType.TIME, SplitType.FRAME):
            raise ValueError(f"Invalid split type: {split_type}")
        if not split_interval or split_interval <= 0:
            raise ValueError(f"Split interval must be positive, got: {split_interval}")
        if split_type == SplitType.SIZE:
            return math.ceil(file_size / split_interval)
        if duration is None:
            raise ValueError("Media duration is unknown")
        if split_type == SplitType.TIME:
            return math.ceil(duration / split_interval)
        if not sample_rate or sample_rate <= 0:
            raise ValueError(f"Sample rate must be positive for frame splits, got: {sample_rate}")
        # Longest chunk, in seconds, that holds at most `split_interval` frames/samples.
        seconds_cap = split_interval / sample_rate
        return math.ceil(duration / seconds_cap)

    def _get_path_metadata(self):
        """
        Get the probe metadata of every path that has been split so far,
        as a dict keyed by the input path given to `split`.
        """
        return self.path_metadata
|
|
252
|
+
|
|
253
|
+
def load_data(queue: queue.Queue, paths: list[str], thread_stop: threading.Event):
    """
    Read files from disk and push their contents onto a queue.
    queue: queue.Queue, queue receiving the loaded payloads
    paths: list, each entry is a file path, or a (video_path, audio_path) tuple
    thread_stop: threading.Event, stop signal checked before each entry is loaded;
        a plain boolean is also accepted
    For a path the file bytes are queued; for a tuple a (video_bytes, audio_bytes)
    tuple is queued. When every entry is loaded, the StopIteration class is queued
    as an end marker. If loading fails, the error is logged and a RuntimeError
    instance is queued, followed by the end marker. When a stop is requested the
    function returns immediately and queues nothing further.
    """

    def _stop_requested() -> bool:
        # An Event object is always truthy, so its state must be read through
        # is_set(); fall back to truthiness for plain boolean flags.
        is_set = getattr(thread_stop, "is_set", None)
        return bool(is_set()) if callable(is_set) else bool(thread_stop)

    file = None
    try:
        for file in paths:
            if _stop_requested():
                return
            if isinstance(file, tuple):
                video_file, audio_file = file
                with open(video_file, "rb") as f:
                    video = f.read()
                with open(audio_file, "rb") as f:
                    audio = f.read()
                queue.put((video, audio))
            else:
                with open(file, "rb") as f:
                    queue.put(f.read())
    except Exception as e:
        logging.error(f"Error processing file {file}: {e}")
        queue.put(RuntimeError(f"Error processing file {file}: {e}"))
    queue.put(StopIteration)
|
|
273
|
+
|
|
274
|
+
class DataLoader:
    """
    DataLoader splits a media file into chunks and streams the chunk contents
    through a bounded queue. The file is split once, on instantiation. Iterating
    the loader starts a background thread running `load_data`, which reads the
    chunks from disk and pushes them onto the queue; `__next__` pops them.
    path: str, path to the media file to split
    output_dir: str, directory the chunks are written to
    split_type: SplitType, how split_interval is interpreted
    split_interval: int, size of a chunk, depending on the split type
    interface: LoaderInterface, object performing the split; defaults to MediaInterface()
    size: int, size of the queue
    video_audio_separate: bool, forwarded to the interface's split
    """

    class _StopFlag(threading.Event):
        """
        Stop signal shared with the loader thread. It is truthy once set, so
        both `if flag:` and `flag.is_set()` observe the current state.
        """

        def __bool__(self):
            return self.is_set()

    def __init__(
        self,
        path: str,
        output_dir: str,
        split_type: SplitType = SplitType.SIZE,
        split_interval: int = 450,
        interface: LoaderInterface = None,
        size: int = 2,
        video_audio_separate: bool = False,
    ):
        # Set first so stop()/__del__ stay safe if construction fails later on.
        self.thread = None
        # A shared, mutable flag: a plain bool would be copied into the thread
        # arguments and later changes would never reach the running loader.
        self.thread_stop = self._StopFlag()
        self.queue = queue.Queue(size)
        interface = interface if interface else MediaInterface()
        self.path = Path(path)
        self.output_dir = output_dir
        self.split_interval = split_interval
        self.interface = interface
        self.files_completed = []
        self.split_type = split_type
        self.video_audio_separate = video_audio_separate
        # process the file immediately on instantiation
        self._split()

    def _split(self):
        """Split the input through the interface and remember the resulting chunk files."""
        self.files_completed = self.interface.split(
            self.path,
            self.output_dir,
            split_interval=self.split_interval,
            split_type=self.split_type,
            video_audio_separate=self.video_audio_separate,
        )

    @staticmethod
    def _read_bytes(file_path):
        """Return the full binary content of a file."""
        with open(file_path, "rb") as f:
            return f.read()

    def _drain_queue(self):
        """Discard everything currently queued, waking a producer blocked on a full queue."""
        while True:
            try:
                self.queue.get_nowait()
            except queue.Empty:
                return

    def __next__(self):
        """
        Return the next loaded chunk. Raises StopIteration when the loader has
        finished (or was never started) and re-raises errors reported by the
        loader thread instead of handing them out as data.
        """
        if self.thread is None:
            # Nothing is producing, so waiting on the queue would block forever.
            raise StopIteration
        payload = self.queue.get()
        if payload is StopIteration:
            # The end marker is the last thing the loader queues.
            self.thread.join()
            self.thread = None
            raise StopIteration
        if isinstance(payload, BaseException):
            raise payload
        return payload

    def stop(self):
        """
        Reset the iterator by stopping the thread and clearing the queue.
        """
        thread = getattr(self, "thread", None)
        if thread is not None:
            self.thread_stop.set()
            # Keep draining while waiting: the loader may be blocked putting
            # into a full queue, and a plain join() would then never return.
            while thread.is_alive():
                self._drain_queue()
                thread.join(timeout=0.05)
            self.thread = None
            self.thread_stop.clear()
        if getattr(self, "queue", None) is not None:
            self._drain_queue()

    def __iter__(self):
        """Restart loading from the first chunk in a fresh background thread and return self."""
        self.stop()
        self.thread = threading.Thread(
            target=load_data,
            args=(
                self.queue,
                self.files_completed,
                self.thread_stop,
            ),
            daemon=True,
        )
        self.thread.start()
        return self

    def __len__(self):
        """Return the number of chunks produced by the split."""
        return len(self.files_completed)

    def __getitem__(self, index):
        """
        Read the chunk at `index` directly from disk. Returns the file bytes, or a
        tuple of bytes when the entry is a (video, audio) tuple of paths. Errors
        are logged and re-raised.
        """
        try:
            entry = self.files_completed[index]
            if isinstance(entry, tuple):
                return tuple(self._read_bytes(file_path) for file_path in entry)
            return self._read_bytes(entry)
        except Exception as e:
            logging.error(f"Error getting item {index}: {e}")
            raise

    def __del__(self):
        self.stop()

    def __enter__(self):
        """Allow `with DataLoader(...) as loader:`; the loader is stopped on exit."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.stop()

    def get_metadata(self):
        """
        Get the metadata collected by the interface for the paths it has split.
        """

        return self.interface._get_path_metadata()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: nv-ingest-api
|
|
3
|
-
Version: 2025.9.23.dev20250923
|
|
3
|
+
Version: 2025.9.26.dev20250926
|
|
4
4
|
Summary: Python module with core document ingestion functions.
|
|
5
5
|
Author-email: Jeremy Dyer <jdyer@nvidia.com>
|
|
6
6
|
License: Apache License
|
|
@@ -214,9 +214,13 @@ Classifier: Operating System :: OS Independent
|
|
|
214
214
|
Description-Content-Type: text/markdown
|
|
215
215
|
License-File: LICENSE
|
|
216
216
|
Requires-Dist: backoff==2.2.1
|
|
217
|
+
Requires-Dist: moviepy==2.2.1
|
|
217
218
|
Requires-Dist: pandas>=2.0
|
|
218
219
|
Requires-Dist: pydantic>2.0.0
|
|
219
220
|
Requires-Dist: pydantic-settings>2.0.0
|
|
221
|
+
Requires-Dist: fsspec>=2025.5.1
|
|
222
|
+
Requires-Dist: universal_pathlib>=0.2.6
|
|
223
|
+
Requires-Dist: ffmpeg-python==0.2.0
|
|
220
224
|
Requires-Dist: tritonclient
|
|
221
225
|
Dynamic: license-file
|
|
222
226
|
|
|
@@ -124,6 +124,8 @@ src/nv_ingest_api/util/converters/datetools.py
|
|
|
124
124
|
src/nv_ingest_api/util/converters/dftools.py
|
|
125
125
|
src/nv_ingest_api/util/converters/formats.py
|
|
126
126
|
src/nv_ingest_api/util/converters/type_mappings.py
|
|
127
|
+
src/nv_ingest_api/util/dataloader/__init__.py
|
|
128
|
+
src/nv_ingest_api/util/dataloader/dataloader.py
|
|
127
129
|
src/nv_ingest_api/util/detectors/__init__.py
|
|
128
130
|
src/nv_ingest_api/util/detectors/language.py
|
|
129
131
|
src/nv_ingest_api/util/exception_handlers/__init__.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|