nv-ingest-api 2025.10.8.dev20251008__tar.gz → 2025.10.10.dev20251010__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- {nv_ingest_api-2025.10.8.dev20251008/src/nv_ingest_api.egg-info → nv_ingest_api-2025.10.10.dev20251010}/PKG-INFO +1 -1
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/image/chart_extractor.py +7 -3
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/image/infographic_extractor.py +7 -3
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/image/table_extractor.py +7 -3
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pdf/engines/pdfium.py +6 -4
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +9 -2
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/nim_client.py +44 -11
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +5 -1
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/service_clients/rest/rest_client.py +9 -2
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010/src/nv_ingest_api.egg-info}/PKG-INFO +1 -1
- nv_ingest_api-2025.10.10.dev20251010/src/udfs/llm_summarizer_udf.py +204 -0
- nv_ingest_api-2025.10.8.dev20251008/src/udfs/llm_summarizer_udf.py +0 -210
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/LICENSE +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/MANIFEST.in +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/README.md +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/pyproject.toml +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/setup.cfg +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/interface/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/interface/extract.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/interface/mutate.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/interface/store.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/interface/transform.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/interface/utility.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/enums/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/enums/common.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/audio/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/docx/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/html/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/html/html_extractor.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/image/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/image/image_extractor.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pdf/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pptx/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/meta/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/meta/udf.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/mutate/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/mutate/deduplicate.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/mutate/filter.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/control_message_task.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/ingest_control_message.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/default_values.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/tracing/latency.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/tracing/logging.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/tracing/tagging.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/extract/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/extract/extract_html_schema.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/meta/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/meta/udf.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/mutate/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/store/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/transform/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/store/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/store/embed_text_upload.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/store/image_upload.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/transform/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/transform/caption_image.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/transform/embed_text.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/transform/split_text.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/control_message/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/control_message/validators.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/converters/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/converters/bytetools.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/converters/containers.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/converters/datetools.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/converters/dftools.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/converters/formats.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/converters/type_mappings.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/dataloader/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/dataloader/dataloader.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/detectors/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/detectors/language.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/exception_handlers/converters.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/exception_handlers/decorators.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/exception_handlers/detectors.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/exception_handlers/pdf.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/exception_handlers/schemas.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/image_processing/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/image_processing/clustering.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/image_processing/processing.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/image_processing/table_and_chart.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/image_processing/transforms.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/imports/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/imports/callable_signatures.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/imports/dynamic_resolvers.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/introspection/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/introspection/class_inspect.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/introspection/function_inspect.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/logging/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/logging/configuration.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/logging/sanitize.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/message_brokers/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/metadata/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/metadata/aggregators.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/multi_processing/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/nim/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/pdf/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/pdf/pdfium.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/schema/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/schema/schema_validator.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/service_clients/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/service_clients/client_base.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/service_clients/kafka/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/service_clients/redis/redis_client.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/string_processing/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/string_processing/configuration.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/string_processing/yaml.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/system/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/system/hardware_info.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api.egg-info/SOURCES.txt +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api.egg-info/dependency_links.txt +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api.egg-info/requires.txt +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api.egg-info/top_level.txt +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/udfs/__init__.py +0 -0
- {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/version.py +0 -0
|
@@ -97,7 +97,7 @@ def _run_chart_inference(
|
|
|
97
97
|
model_name="paddle",
|
|
98
98
|
max_batch_size=1 if ocr_client.protocol == "grpc" else 2,
|
|
99
99
|
)
|
|
100
|
-
elif ocr_model_name
|
|
100
|
+
elif ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"}:
|
|
101
101
|
future_ocr_kwargs.update(
|
|
102
102
|
model_name=ocr_model_name,
|
|
103
103
|
input_names=["INPUT_IMAGE_URLS", "MERGE_LEVELS"],
|
|
@@ -237,7 +237,9 @@ def _create_ocr_client(
|
|
|
237
237
|
auth_token: str,
|
|
238
238
|
) -> NimClient:
|
|
239
239
|
ocr_model_interface = (
|
|
240
|
-
NemoRetrieverOCRModelInterface()
|
|
240
|
+
NemoRetrieverOCRModelInterface()
|
|
241
|
+
if ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"}
|
|
242
|
+
else PaddleOCRModelInterface()
|
|
241
243
|
)
|
|
242
244
|
|
|
243
245
|
ocr_client = create_inference_client(
|
|
@@ -245,7 +247,9 @@ def _create_ocr_client(
|
|
|
245
247
|
model_interface=ocr_model_interface,
|
|
246
248
|
auth_token=auth_token,
|
|
247
249
|
infer_protocol=ocr_protocol,
|
|
248
|
-
enable_dynamic_batching=(
|
|
250
|
+
enable_dynamic_batching=(
|
|
251
|
+
True if ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"} else False
|
|
252
|
+
),
|
|
249
253
|
dynamic_batch_memory_budget_mb=32,
|
|
250
254
|
)
|
|
251
255
|
|
|
@@ -107,7 +107,7 @@ def _update_infographic_metadata(
|
|
|
107
107
|
model_name="paddle",
|
|
108
108
|
max_batch_size=1 if ocr_client.protocol == "grpc" else 2,
|
|
109
109
|
)
|
|
110
|
-
elif ocr_model_name
|
|
110
|
+
elif ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"}:
|
|
111
111
|
infer_kwargs.update(
|
|
112
112
|
model_name=ocr_model_name,
|
|
113
113
|
input_names=["INPUT_IMAGE_URLS", "MERGE_LEVELS"],
|
|
@@ -152,7 +152,9 @@ def _create_ocr_client(
|
|
|
152
152
|
auth_token: str,
|
|
153
153
|
) -> NimClient:
|
|
154
154
|
ocr_model_interface = (
|
|
155
|
-
NemoRetrieverOCRModelInterface()
|
|
155
|
+
NemoRetrieverOCRModelInterface()
|
|
156
|
+
if ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"}
|
|
157
|
+
else PaddleOCRModelInterface()
|
|
156
158
|
)
|
|
157
159
|
|
|
158
160
|
ocr_client = create_inference_client(
|
|
@@ -160,7 +162,9 @@ def _create_ocr_client(
|
|
|
160
162
|
model_interface=ocr_model_interface,
|
|
161
163
|
auth_token=auth_token,
|
|
162
164
|
infer_protocol=ocr_protocol,
|
|
163
|
-
enable_dynamic_batching=(
|
|
165
|
+
enable_dynamic_batching=(
|
|
166
|
+
True if ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"} else False
|
|
167
|
+
),
|
|
164
168
|
dynamic_batch_memory_budget_mb=32,
|
|
165
169
|
)
|
|
166
170
|
|
|
@@ -99,7 +99,7 @@ def _run_inference(
|
|
|
99
99
|
model_name="paddle",
|
|
100
100
|
max_batch_size=1 if ocr_client.protocol == "grpc" else 2,
|
|
101
101
|
)
|
|
102
|
-
elif ocr_model_name
|
|
102
|
+
elif ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"}:
|
|
103
103
|
future_ocr_kwargs.update(
|
|
104
104
|
model_name=ocr_model_name,
|
|
105
105
|
input_names=["INPUT_IMAGE_URLS", "MERGE_LEVELS"],
|
|
@@ -246,7 +246,9 @@ def _create_ocr_client(
|
|
|
246
246
|
auth_token: str,
|
|
247
247
|
) -> NimClient:
|
|
248
248
|
ocr_model_interface = (
|
|
249
|
-
NemoRetrieverOCRModelInterface()
|
|
249
|
+
NemoRetrieverOCRModelInterface()
|
|
250
|
+
if ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"}
|
|
251
|
+
else PaddleOCRModelInterface()
|
|
250
252
|
)
|
|
251
253
|
|
|
252
254
|
ocr_client = create_inference_client(
|
|
@@ -254,7 +256,9 @@ def _create_ocr_client(
|
|
|
254
256
|
model_interface=ocr_model_interface,
|
|
255
257
|
auth_token=auth_token,
|
|
256
258
|
infer_protocol=ocr_protocol,
|
|
257
|
-
enable_dynamic_batching=(
|
|
259
|
+
enable_dynamic_batching=(
|
|
260
|
+
True if ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"} else False
|
|
261
|
+
),
|
|
258
262
|
dynamic_batch_memory_budget_mb=32,
|
|
259
263
|
)
|
|
260
264
|
|
|
@@ -332,6 +332,7 @@ def _extract_page_elements(
|
|
|
332
332
|
|
|
333
333
|
# Process each extracted element based on extraction flags
|
|
334
334
|
for page_idx, page_element in page_element_results:
|
|
335
|
+
page_reading_index = page_idx + 1
|
|
335
336
|
# Skip elements that shouldn't be extracted based on flags
|
|
336
337
|
if (not extract_tables) and (page_element.type_string == "table"):
|
|
337
338
|
continue
|
|
@@ -347,7 +348,7 @@ def _extract_page_elements(
|
|
|
347
348
|
# Construct metadata for the page element
|
|
348
349
|
page_element_meta = construct_page_element_metadata(
|
|
349
350
|
page_element,
|
|
350
|
-
|
|
351
|
+
page_reading_index,
|
|
351
352
|
page_count,
|
|
352
353
|
source_metadata,
|
|
353
354
|
base_unified_metadata,
|
|
@@ -473,6 +474,7 @@ def pdfium_extractor(
|
|
|
473
474
|
for page_idx in range(page_count):
|
|
474
475
|
page = doc.get_page(page_idx)
|
|
475
476
|
page_width, page_height = page.get_size()
|
|
477
|
+
page_reading_index = page_idx + 1
|
|
476
478
|
|
|
477
479
|
# Text extraction
|
|
478
480
|
if extract_text:
|
|
@@ -481,7 +483,7 @@ def pdfium_extractor(
|
|
|
481
483
|
text_meta = construct_text_metadata(
|
|
482
484
|
[page_text],
|
|
483
485
|
pdf_metadata.keywords,
|
|
484
|
-
|
|
486
|
+
page_reading_index,
|
|
485
487
|
-1,
|
|
486
488
|
-1,
|
|
487
489
|
-1,
|
|
@@ -499,7 +501,7 @@ def pdfium_extractor(
|
|
|
499
501
|
image_data = _extract_page_images(
|
|
500
502
|
extract_images_method,
|
|
501
503
|
page,
|
|
502
|
-
|
|
504
|
+
page_reading_index,
|
|
503
505
|
page_width,
|
|
504
506
|
page_height,
|
|
505
507
|
page_count,
|
|
@@ -518,7 +520,7 @@ def pdfium_extractor(
|
|
|
518
520
|
base64_image, _ = scale_image_to_encoding_size(base64_image, max_base64_size=2**24 - 1)
|
|
519
521
|
image_meta = construct_image_metadata_from_base64(
|
|
520
522
|
base64_image,
|
|
521
|
-
|
|
523
|
+
page_reading_index,
|
|
522
524
|
page_count,
|
|
523
525
|
source_metadata,
|
|
524
526
|
base_unified_metadata,
|
|
@@ -21,7 +21,10 @@ from nv_ingest_api.internal.primitives.nim.model_interface.helpers import prepro
|
|
|
21
21
|
from nv_ingest_api.util.image_processing.transforms import base64_to_numpy
|
|
22
22
|
|
|
23
23
|
DEFAULT_OCR_MODEL_NAME = "paddle"
|
|
24
|
-
NEMORETRIEVER_OCR_MODEL_NAME = "
|
|
24
|
+
NEMORETRIEVER_OCR_MODEL_NAME = "scene_text_wrapper"
|
|
25
|
+
NEMORETRIEVER_OCR_ENSEMBLE_MODEL_NAME = "scene_text_ensemble"
|
|
26
|
+
NEMORETRIEVER_OCR_BLS_MODEL_NAME = "scene_text_python"
|
|
27
|
+
|
|
25
28
|
|
|
26
29
|
logger = logging.getLogger(__name__)
|
|
27
30
|
|
|
@@ -231,7 +234,11 @@ class OCRModelInterfaceBase(ModelInterface):
|
|
|
231
234
|
if not isinstance(response, np.ndarray):
|
|
232
235
|
raise ValueError("Unexpected response format: response is not a NumPy array.")
|
|
233
236
|
|
|
234
|
-
if model_name
|
|
237
|
+
if model_name in [
|
|
238
|
+
NEMORETRIEVER_OCR_MODEL_NAME,
|
|
239
|
+
NEMORETRIEVER_OCR_ENSEMBLE_MODEL_NAME,
|
|
240
|
+
NEMORETRIEVER_OCR_BLS_MODEL_NAME,
|
|
241
|
+
]:
|
|
235
242
|
response = response.transpose((1, 0))
|
|
236
243
|
|
|
237
244
|
# If we have shape (3,), convert to (3, 1)
|
|
@@ -121,9 +121,6 @@ class NimClient:
|
|
|
121
121
|
if model_name == "yolox_ensemble":
|
|
122
122
|
model_name = "yolox"
|
|
123
123
|
|
|
124
|
-
if model_name == "scene_text_ensemble":
|
|
125
|
-
model_name = "scene_text_pre"
|
|
126
|
-
|
|
127
124
|
if model_name in self._max_batch_sizes:
|
|
128
125
|
return self._max_batch_sizes[model_name]
|
|
129
126
|
|
|
@@ -326,16 +323,52 @@ class NimClient:
|
|
|
326
323
|
|
|
327
324
|
outputs = [grpcclient.InferRequestedOutput(output_name) for output_name in output_names]
|
|
328
325
|
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
326
|
+
base_delay = 0.5
|
|
327
|
+
attempt = 0
|
|
328
|
+
retries_429 = 0
|
|
329
|
+
max_grpc_retries = self.max_429_retries
|
|
330
|
+
|
|
331
|
+
while attempt < self.max_retries:
|
|
332
|
+
try:
|
|
333
|
+
response = self.client.infer(
|
|
334
|
+
model_name=model_name, parameters=parameters, inputs=input_tensors, outputs=outputs
|
|
335
|
+
)
|
|
332
336
|
|
|
333
|
-
|
|
337
|
+
logger.debug(f"gRPC inference response: {response}")
|
|
334
338
|
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
+
if len(outputs) == 1:
|
|
340
|
+
return response.as_numpy(outputs[0].name())
|
|
341
|
+
else:
|
|
342
|
+
return [response.as_numpy(output.name()) for output in outputs]
|
|
343
|
+
|
|
344
|
+
except grpcclient.InferenceServerException as e:
|
|
345
|
+
status = e.status()
|
|
346
|
+
if status == "StatusCode.UNAVAILABLE" and "Exceeds maximum queue size".lower() in e.message().lower():
|
|
347
|
+
retries_429 += 1
|
|
348
|
+
logger.warning(
|
|
349
|
+
f"Received gRPC {status} for model '{model_name}'. "
|
|
350
|
+
f"Attempt {retries_429} of {max_grpc_retries}."
|
|
351
|
+
)
|
|
352
|
+
if retries_429 >= max_grpc_retries:
|
|
353
|
+
logger.error(f"Max retries for gRPC {status} exceeded for model '{model_name}'.")
|
|
354
|
+
raise
|
|
355
|
+
|
|
356
|
+
backoff_time = base_delay * (2**retries_429)
|
|
357
|
+
time.sleep(backoff_time)
|
|
358
|
+
continue
|
|
359
|
+
|
|
360
|
+
else:
|
|
361
|
+
# For other server-side errors (e.g., INVALID_ARGUMENT, NOT_FOUND),
|
|
362
|
+
# retrying will not help. We should fail fast.
|
|
363
|
+
logger.error(
|
|
364
|
+
f"Received non-retryable gRPC error from Triton for model '{model_name}': {e.message()}"
|
|
365
|
+
)
|
|
366
|
+
raise
|
|
367
|
+
|
|
368
|
+
except Exception as e:
|
|
369
|
+
# Catch any other unexpected exceptions (e.g., network issues not caught by Triton client)
|
|
370
|
+
logger.error(f"An unexpected error occurred during gRPC inference for model '{model_name}': {e}")
|
|
371
|
+
raise
|
|
339
372
|
|
|
340
373
|
def _http_infer(self, formatted_input: dict) -> dict:
|
|
341
374
|
"""
|
|
@@ -24,8 +24,12 @@ logger = logging.getLogger(__name__)
|
|
|
24
24
|
# Tracing Options Schema
|
|
25
25
|
class TracingOptionsSchema(BaseModelNoExt):
|
|
26
26
|
trace: bool = False
|
|
27
|
-
ts_send: int
|
|
27
|
+
ts_send: Optional[int] = None
|
|
28
28
|
trace_id: Optional[str] = None
|
|
29
|
+
# V2 PDF splitting support
|
|
30
|
+
parent_job_id: Optional[str] = None
|
|
31
|
+
page_num: Optional[int] = None
|
|
32
|
+
total_pages: Optional[int] = None
|
|
29
33
|
|
|
30
34
|
|
|
31
35
|
# Ingest Task Schemas
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
+
import os
|
|
6
7
|
import re
|
|
7
8
|
import time
|
|
8
9
|
from typing import Any, Union, Tuple, Optional, Dict, Callable
|
|
@@ -137,13 +138,19 @@ class RestClient(MessageBrokerClientBase):
|
|
|
137
138
|
)
|
|
138
139
|
self._client = requests.Session()
|
|
139
140
|
|
|
140
|
-
|
|
141
|
-
|
|
141
|
+
# Allow API version override via environment variable or kwargs
|
|
142
|
+
api_version = kwargs.get("api_version") or os.getenv("NV_INGEST_API_VERSION", "v1")
|
|
143
|
+
self._api_version = api_version
|
|
144
|
+
self._submit_endpoint: str = f"/{api_version}/submit_job"
|
|
145
|
+
self._fetch_endpoint: str = f"/{api_version}/fetch_job"
|
|
142
146
|
self._base_url: str = kwargs.get("base_url") or self._generate_url(self._host, self._port)
|
|
143
147
|
self._headers = kwargs.get("headers", {})
|
|
144
148
|
self._auth = kwargs.get("auth", None)
|
|
145
149
|
|
|
146
150
|
logger.debug(f"RestClient base URL set to: {self._base_url}")
|
|
151
|
+
logger.info(
|
|
152
|
+
f"RestClient using API version: {api_version} (endpoints: {self._submit_endpoint}, {self._fetch_endpoint})"
|
|
153
|
+
)
|
|
147
154
|
|
|
148
155
|
@staticmethod
|
|
149
156
|
def _generate_url(host: str, port: int) -> str:
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
LLM Content Summarizer UDF for NV-Ingest Pipeline
|
|
4
|
+
|
|
5
|
+
This UDF uses an LLM to generate concise summaries of text content chunks. These summaries are added to the metadata
|
|
6
|
+
for enhanced downstream processing and search capabilities.
|
|
7
|
+
|
|
8
|
+
These variables can be set in the environment before running the pipeline. These can be treated as kwargs.
|
|
9
|
+
- NVIDIA_API_KEY: API key for NVIDIA NIM endpoints (required)
|
|
10
|
+
- LLM_SUMMARIZATION_MODEL: Model to use (default: nvidia/llama-3.1-nemotron-70b-instruct)
|
|
11
|
+
- LLM_SUMMARIZATION_BASE_URL: base URL (default: https://integrate.api.nvidia.com/v1)
|
|
12
|
+
- LLM_SUMMARIZATION_TIMEOUT: API timeout in seconds (default: 60)
|
|
13
|
+
- LLM_MIN_CONTENT_LENGTH: Minimum content length to summarize (default: 50)
|
|
14
|
+
- LLM_MAX_CONTENT_LENGTH: Maximum content length to send to API (default: 12000)
|
|
15
|
+
TODO: Implement this
|
|
16
|
+
- NUM_CHUNKS: (Optional) Number of first and last pages to summarize. default=1
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import logging
|
|
20
|
+
import os
|
|
21
|
+
import time
|
|
22
|
+
|
|
23
|
+
# REMOVE BEFORE MERGING
|
|
24
|
+
# import yaml
|
|
25
|
+
# from pathlib import Path
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
PROMPT = """
|
|
31
|
+
Here are the contents from the first and last page of a document. Focus on the main purpose, key topics,
|
|
32
|
+
and important details. Just return the summary as a paragraph. Do not add special characters for formatting.
|
|
33
|
+
This summary will be used for document search and understanding.
|
|
34
|
+
|
|
35
|
+
[CONTENT]
|
|
36
|
+
{content}
|
|
37
|
+
[END CONTENT]
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def content_summarizer(control_message: "IngestControlMessage") -> "IngestControlMessage":  # noqa: F821
    """
    UDF function that adds an LLM-generated summary to the payload metadata.

    The text of the first and last rows (chunks) of the DataFrame payload is
    combined and sent to an LLM. On success the summary is stored in the first
    row's metadata under ``custom_content["llm_summarizer_udf"]``. The payload
    written back to the control message always contains every original row;
    only the first row's metadata is modified.

    Configuration is read from environment variables:

    - ``NVIDIA_API_KEY``: API key; summarization is skipped when unset
    - ``LLM_SUMMARIZATION_MODEL``: default ``nvidia/llama-3.1-nemotron-70b-instruct``
    - ``LLM_SUMMARIZATION_BASE_URL``: default ``https://integrate.api.nvidia.com/v1``
    - ``LLM_MIN_CONTENT_LENGTH``: default 50
    - ``LLM_MAX_CONTENT_LENGTH``: default 12000
    - ``LLM_SUMMARIZATION_TIMEOUT``: default 60 (seconds)

    Parameters
    ----------
    control_message : IngestControlMessage
        The control message containing the DataFrame payload with text content

    Returns
    -------
    IngestControlMessage
        The same control message. Its payload is updated only when a summary
        was generated; on any skip or failure it is returned untouched.
    """
    logger.info("UDF: Starting LLM content summarization")

    api_key = os.getenv("NVIDIA_API_KEY")
    model_name = os.getenv("LLM_SUMMARIZATION_MODEL", "nvidia/llama-3.1-nemotron-70b-instruct")
    base_url = os.getenv("LLM_SUMMARIZATION_BASE_URL", "https://integrate.api.nvidia.com/v1")
    min_content_length = int(os.getenv("LLM_MIN_CONTENT_LENGTH", 50))
    max_content_length = int(os.getenv("LLM_MAX_CONTENT_LENGTH", 12000))
    timeout = int(os.getenv("LLM_SUMMARIZATION_TIMEOUT", 60))

    stats = {
        "skipped": False,
        "failed": False,
        "tokens": 0,
        "duration": 0.0,
    }

    if not api_key:
        logger.error("NVIDIA_API_KEY not set. Skipping...")
        return control_message

    df = control_message.payload()

    if df is None or df.empty:
        logger.warning("No payload found. Nothing to summarize.")
        return control_message

    # Select first and last chunk for summarization.
    # According to docs/docs/extraction/user_defined_functions.md#understanding-the-dataframe-payload
    # the rows are not necessarily pages; they are chunks of data extracted from the document. Selecting
    # real pages would require parsing the payload to see which chunks correspond to which pages.
    #
    # The selection is kept in its own variable: rebinding `df` here would make the payload written
    # back below contain only these two rows and silently drop every other chunk of the document.
    if len(df) > 1:
        # TODO: add feature to select N first and last chunks
        selected = df.iloc[[0, -1]]
    else:
        logger.info("Document has only one chunk")
        selected = df

    # Combine the selected chunks into a single string, ignoring chunks that were skipped
    content_list = selected.apply(
        _extract_content,
        axis=1,
        min_content_length=min_content_length,
        max_content_length=max_content_length,
        stats=stats,
    )
    content = " ".join(part for part in content_list if part)

    if not content:
        # Every selected chunk was skipped; an LLM call on an empty prompt would only waste a request
        logger.warning("No usable content found in the selected chunks. Skipping summarization.")
        return control_message

    # Nicely ask LLM to summarize content
    summary, stats["duration"] = _generate_llm_summary(content, model_name, base_url, api_key, timeout)

    stats["failed"] = summary is None
    if stats["failed"]:
        logger.warning("%s failed to summarize content", model_name)
        return control_message

    stats["tokens"] = _estimate_tokens(content)
    logger.info("Summarized %d tokens in %f seconds using %s", stats["tokens"], stats["duration"], model_name)

    # Row 0 of the full frame is the same row as row 0 of the selection
    _store_summary(df, summary, model_name)

    # Update the control message with the full, modified DataFrame
    control_message.payload(df)

    return control_message
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _extract_content(row, stats: dict, min_content_length: int = 50, max_content_length: int = 12000) -> str | None:
|
|
122
|
+
"""Extract text content from row"""
|
|
123
|
+
metadata = row.get("metadata")
|
|
124
|
+
|
|
125
|
+
if isinstance(metadata, dict):
|
|
126
|
+
content = metadata.get("content")
|
|
127
|
+
if content is not None:
|
|
128
|
+
content = content.strip()
|
|
129
|
+
if len(content) < min_content_length:
|
|
130
|
+
stats["skipped"] = True
|
|
131
|
+
logger.warning(f"Content less than min={min_content_length}. Skipping...")
|
|
132
|
+
content = ""
|
|
133
|
+
elif len(content) > max_content_length:
|
|
134
|
+
logger.warning(f"Truncating content to {max_content_length} characters")
|
|
135
|
+
content = content[:max_content_length]
|
|
136
|
+
else:
|
|
137
|
+
stats["skipped"] = True
|
|
138
|
+
content = ""
|
|
139
|
+
|
|
140
|
+
else:
|
|
141
|
+
stats["skipped"] = True
|
|
142
|
+
logger.warning("No metadata found. Skipping...")
|
|
143
|
+
content = ""
|
|
144
|
+
|
|
145
|
+
return content
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _generate_llm_summary(
    content: str,
    model_name: str,
    base_url: str,
    api_key: str,
    timeout: int,
) -> tuple[str | None, float]:
    """
    Ask an LLM to summarize content extracted from a document.

    Parameters
    ----------
    content : str
        Text inserted into ``PROMPT`` and sent as a single user message.
    model_name : str
        Model identifier passed to the chat completion request.
    base_url : str
        Base URL given to the ``OpenAI`` client.
    api_key : str
        API key given to the ``OpenAI`` client.
    timeout : int
        Timeout given to the ``OpenAI`` client.

    Returns
    -------
    tuple[str | None, float]
        ``(summary, duration_seconds)``. ``summary`` is None when the request
        fails, returns no choices, or returns an empty message. The duration is
        returned in every case, including failures.
    """
    # perf_counter is monotonic, so the measured interval is immune to wall-clock adjustments.
    # This first reading only matters if the import or client construction fails.
    start_time = time.perf_counter()
    try:
        from openai import OpenAI

        client = OpenAI(base_url=base_url, api_key=api_key, timeout=timeout)

        # Restart the timer so a successful duration covers the request only
        start_time = time.perf_counter()
        completion = client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": PROMPT.format(content=content)}],
            max_tokens=400,  # Increased for more comprehensive summaries
            temperature=0.7,
        )
        duration = time.perf_counter() - start_time

        if not completion.choices:
            return None, duration

        # The message content may be None; treat that and blank text as "no summary"
        # instead of relying on the broad handler below to catch an AttributeError.
        text = completion.choices[0].message.content
        summary = text.strip() if isinstance(text, str) else ""
        return (summary or None), duration

    except Exception as e:
        # Best-effort boundary: a failed summary must not break the pipeline
        logger.error("API call failed: %s", e)
        return None, time.perf_counter() - start_time
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def _store_summary(df, summary: str, model_name: str):
|
|
185
|
+
"""Add summary to metadata and store in df"""
|
|
186
|
+
# hardcoded heuristic to store everything on chunk 0's metadata
|
|
187
|
+
row_0 = df.iloc[0]
|
|
188
|
+
|
|
189
|
+
# this is a reference to a dictionary that is stored in the dataframe
|
|
190
|
+
# and is modified in place
|
|
191
|
+
metadata = row_0.get("metadata")
|
|
192
|
+
|
|
193
|
+
if metadata.get("custom_content") is None:
|
|
194
|
+
metadata["custom_content"] = {}
|
|
195
|
+
metadata["custom_content"]["llm_summarizer_udf"] = {"summary": summary, "model": model_name}
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _estimate_tokens(text: str) -> int:
|
|
199
|
+
"""Rough estimate (~4 characters per token)"""
|
|
200
|
+
return len(text) // 4
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _safe_model_name(name: str) -> str:
|
|
204
|
+
return name.replace("/", "__").replace("-", "_")
|