nv-ingest-api 2025.5.18.dev20250518__tar.gz → 2025.5.20.dev20250520__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- {nv_ingest_api-2025.5.18.dev20250518/src/nv_ingest_api.egg-info → nv_ingest_api-2025.5.20.dev20250520}/PKG-INFO +1 -1
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +142 -86
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +170 -171
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/transform/split_text.py +9 -3
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520/src/nv_ingest_api.egg-info}/PKG-INFO +1 -1
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/LICENSE +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/MANIFEST.in +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/README.md +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/pyproject.toml +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/setup.cfg +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/interface/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/interface/extract.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/interface/mutate.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/interface/store.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/interface/transform.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/interface/utility.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/enums/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/enums/common.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/audio/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/docx/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/image/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/image/chart_extractor.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/image/image_extractor.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/image/table_extractor.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/pdf/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/pptx/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/mutate/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/mutate/deduplicate.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/mutate/filter.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/primitives/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/primitives/control_message_task.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/primitives/ingest_control_message.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/primitives/nim/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/primitives/nim/default_values.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/primitives/nim/nim_client.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/primitives/tracing/latency.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/primitives/tracing/logging.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/primitives/tracing/tagging.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/extract/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/meta/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/mutate/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/store/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/transform/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/store/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/store/embed_text_upload.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/store/image_upload.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/transform/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/transform/caption_image.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/internal/transform/embed_text.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/control_message/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/control_message/validators.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/converters/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/converters/bytetools.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/converters/containers.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/converters/datetools.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/converters/dftools.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/converters/formats.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/converters/type_mappings.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/detectors/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/detectors/language.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/exception_handlers/converters.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/exception_handlers/decorators.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/exception_handlers/detectors.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/exception_handlers/pdf.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/exception_handlers/schemas.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/image_processing/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/image_processing/clustering.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/image_processing/processing.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/image_processing/table_and_chart.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/image_processing/transforms.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/logging/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/logging/configuration.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/message_brokers/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/metadata/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/metadata/aggregators.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/multi_processing/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/nim/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/pdf/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/pdf/pdfium.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/schema/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/schema/schema_validator.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/service_clients/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/service_clients/client_base.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/service_clients/kafka/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/service_clients/redis/redis_client.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/service_clients/rest/rest_client.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/string_processing/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/system/__init__.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api/util/system/hardware_info.py +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api.egg-info/SOURCES.txt +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api.egg-info/dependency_links.txt +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api.egg-info/requires.txt +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/nv_ingest_api.egg-info/top_level.txt +0 -0
- {nv_ingest_api-2025.5.18.dev20250518 → nv_ingest_api-2025.5.20.dev20250520}/src/version.py +0 -0
|
@@ -274,59 +274,70 @@ class DocxReader:
|
|
|
274
274
|
- A list of extracted images from the paragraph.
|
|
275
275
|
"""
|
|
276
276
|
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
277
|
+
try:
|
|
278
|
+
paragraph_images = []
|
|
279
|
+
if self.paragraph_format == "text":
|
|
280
|
+
return paragraph.text.strip(), paragraph_images
|
|
281
|
+
|
|
282
282
|
font = paragraph.style.font
|
|
283
283
|
default_style = (font.bold, font.italic, font.underline)
|
|
284
284
|
|
|
285
|
-
# Iterate over the runs of the paragraph and group them by style, excluding empty runs
|
|
286
285
|
paragraph_text = ""
|
|
287
286
|
group_text = ""
|
|
288
287
|
previous_style = None
|
|
289
288
|
|
|
290
289
|
for c in paragraph.iter_inner_content():
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
290
|
+
try:
|
|
291
|
+
if isinstance(c, Hyperlink):
|
|
292
|
+
text = f"[{c.text}]({c.address})"
|
|
293
|
+
style = (c.runs[0].bold, c.runs[0].italic, c.runs[0].underline)
|
|
294
|
+
elif isinstance(c, Run):
|
|
295
|
+
text = c.text
|
|
296
|
+
style = (c.bold, c.italic, c.underline)
|
|
297
|
+
|
|
298
|
+
# 1. Locate the inline shape which is stored in the <w:drawing> element.
|
|
299
|
+
# 2. r:embed in <a.blip> has the relationship id for extracting the file where
|
|
300
|
+
# the image is stored as bytes.
|
|
301
|
+
# Reference:
|
|
302
|
+
# https://python-docx.readthedocs.io/en/latest/dev/analysis/features/shapes/picture.html#specimen-xml
|
|
303
|
+
inline_shapes = c._element.xpath(".//w:drawing//a:blip/@r:embed")
|
|
304
|
+
for r_id in inline_shapes:
|
|
305
|
+
text += self.image_tag.format(self.image_tag_index)
|
|
306
|
+
self.image_tag_index += 1
|
|
307
|
+
try:
|
|
308
|
+
image = paragraph.part.related_parts[r_id].image
|
|
309
|
+
paragraph_images.append(image)
|
|
310
|
+
except Exception as img_e:
|
|
311
|
+
logger.warning(
|
|
312
|
+
"Failed to extract image with rId " "%s: %s -- object / file may be malformed",
|
|
313
|
+
r_id,
|
|
314
|
+
img_e,
|
|
315
|
+
)
|
|
316
|
+
else:
|
|
317
|
+
continue
|
|
318
|
+
|
|
319
|
+
style = tuple(s if s is not None else d for s, d in zip(style, default_style))
|
|
320
|
+
|
|
321
|
+
if not self.is_text_empty(text) and previous_style is not None and style != previous_style:
|
|
316
322
|
paragraph_text += self.format_text(group_text, *previous_style)
|
|
317
323
|
group_text = ""
|
|
318
324
|
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
325
|
+
group_text += text
|
|
326
|
+
if not self.is_text_empty(text):
|
|
327
|
+
previous_style = style
|
|
322
328
|
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
329
|
+
except Exception as e:
|
|
330
|
+
logger.error("format_paragraph: failed to process run: %s", e)
|
|
331
|
+
continue
|
|
332
|
+
|
|
333
|
+
if group_text and previous_style:
|
|
334
|
+
paragraph_text += self.format_text(group_text, *previous_style)
|
|
335
|
+
|
|
336
|
+
return paragraph_text.strip(), paragraph_images
|
|
326
337
|
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
338
|
+
except Exception as e:
|
|
339
|
+
logger.error("format_paragraph: failed for paragraph: %s", e)
|
|
340
|
+
return "", []
|
|
330
341
|
|
|
331
342
|
def format_cell(self, cell: "_Cell") -> Tuple[str, List["Image"]]:
|
|
332
343
|
"""
|
|
@@ -344,12 +355,23 @@ class DocxReader:
|
|
|
344
355
|
- A list of images extracted from the cell.
|
|
345
356
|
"""
|
|
346
357
|
|
|
347
|
-
|
|
348
|
-
newline = "<br>"
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
358
|
+
try:
|
|
359
|
+
newline = "<br>" if self.paragraph_format == "markdown" else "\n"
|
|
360
|
+
texts, images = [], []
|
|
361
|
+
|
|
362
|
+
for p in cell.paragraphs:
|
|
363
|
+
try:
|
|
364
|
+
t, imgs = self.format_paragraph(p)
|
|
365
|
+
texts.append(t)
|
|
366
|
+
images.extend(imgs)
|
|
367
|
+
except Exception as e:
|
|
368
|
+
logger.error("format_cell: failed to format paragraph in cell: %s", e)
|
|
369
|
+
|
|
370
|
+
return newline.join(texts), images
|
|
371
|
+
|
|
372
|
+
except Exception as e:
|
|
373
|
+
logger.error("format_cell: failed entirely: %s", e)
|
|
374
|
+
return "", []
|
|
353
375
|
|
|
354
376
|
def format_table(self, table: "Table") -> Tuple[Optional[str], List["Image"], DataFrame]:
|
|
355
377
|
"""
|
|
@@ -368,25 +390,50 @@ class DocxReader:
|
|
|
368
390
|
- A DataFrame representation of the table's content.
|
|
369
391
|
"""
|
|
370
392
|
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
393
|
+
try:
|
|
394
|
+
rows_data = []
|
|
395
|
+
all_images = []
|
|
396
|
+
|
|
397
|
+
for row in table.rows:
|
|
398
|
+
row_texts = []
|
|
399
|
+
row_images = []
|
|
400
|
+
for cell in row.cells:
|
|
401
|
+
try:
|
|
402
|
+
cell_text, cell_imgs = self.format_cell(cell)
|
|
403
|
+
row_texts.append(cell_text)
|
|
404
|
+
row_images.extend(cell_imgs)
|
|
405
|
+
except Exception as e:
|
|
406
|
+
logger.error("format_table: failed to process cell: %s", e)
|
|
407
|
+
row_texts.append("") # pad for column alignment
|
|
408
|
+
|
|
409
|
+
rows_data.append(row_texts)
|
|
410
|
+
all_images.extend(row_images)
|
|
411
|
+
|
|
412
|
+
if not rows_data or not rows_data[0]:
|
|
413
|
+
return None, [], pd.DataFrame()
|
|
414
|
+
|
|
415
|
+
header = rows_data[0]
|
|
416
|
+
body = rows_data[1:]
|
|
417
|
+
df = pd.DataFrame(body, columns=header) if body else pd.DataFrame(columns=header)
|
|
418
|
+
|
|
419
|
+
if "markdown" in self.table_format:
|
|
420
|
+
table_text = df.to_markdown(index=False)
|
|
421
|
+
if self.table_format == "markdown_light":
|
|
422
|
+
table_text = re.sub(r"\s{2,}", " ", table_text)
|
|
423
|
+
table_text = re.sub(r"-{2,}", "-", table_text)
|
|
424
|
+
elif self.table_format == "csv":
|
|
425
|
+
table_text = df.to_csv(index=False)
|
|
426
|
+
elif self.table_format == "tag":
|
|
427
|
+
table_text = self.table_tag.format(self.table_tag_index)
|
|
428
|
+
self.table_tag_index += 1
|
|
429
|
+
else:
|
|
430
|
+
raise ValueError(f"Unknown table format {self.table_format}")
|
|
431
|
+
|
|
432
|
+
return table_text, all_images, df
|
|
388
433
|
|
|
389
|
-
|
|
434
|
+
except Exception as e:
|
|
435
|
+
logger.error("format_table: failed to format table: %s", e)
|
|
436
|
+
return None, [], pd.DataFrame()
|
|
390
437
|
|
|
391
438
|
@staticmethod
|
|
392
439
|
def apply_text_style(style: str, text: str, level: int = 0) -> str:
|
|
@@ -841,30 +888,39 @@ class DocxReader:
|
|
|
841
888
|
self._prev_para_image_idx = 0
|
|
842
889
|
|
|
843
890
|
para_idx = 0
|
|
844
|
-
|
|
845
891
|
for child in self.document.element.body.iterchildren():
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
892
|
+
try:
|
|
893
|
+
if isinstance(child, CT_P):
|
|
894
|
+
paragraph = Paragraph(child, self.document)
|
|
895
|
+
paragraph_text, paragraph_images = self.format_paragraph(paragraph)
|
|
896
|
+
|
|
897
|
+
if extract_text:
|
|
898
|
+
try:
|
|
899
|
+
self._extract_para_text(
|
|
900
|
+
paragraph,
|
|
901
|
+
paragraph_text,
|
|
902
|
+
base_unified_metadata,
|
|
903
|
+
text_depth,
|
|
904
|
+
para_idx,
|
|
905
|
+
)
|
|
906
|
+
except Exception as e:
|
|
907
|
+
logger.error("extract_data: _extract_para_text failed: %s", e)
|
|
908
|
+
|
|
909
|
+
if (extract_images or extract_charts or extract_tables) and paragraph_images:
|
|
910
|
+
self._pending_images += [
|
|
911
|
+
(image, para_idx, "", base_unified_metadata) for image in paragraph_images
|
|
912
|
+
]
|
|
913
|
+
self.images.extend(paragraph_images)
|
|
914
|
+
|
|
915
|
+
elif isinstance(child, CT_Tbl):
|
|
916
|
+
if extract_tables or extract_charts:
|
|
917
|
+
try:
|
|
918
|
+
self._extract_table_data(child, base_unified_metadata)
|
|
919
|
+
except Exception as e:
|
|
920
|
+
logger.error("extract_data: _extract_table_data failed: %s", e)
|
|
864
921
|
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
self._extract_table_data(child, base_unified_metadata)
|
|
922
|
+
except Exception as e:
|
|
923
|
+
logger.error("extract_data: failed to process element at index %d: %s", para_idx, e)
|
|
868
924
|
|
|
869
925
|
para_idx += 1
|
|
870
926
|
|