nv-ingest-api 2025.10.7.dev20251007__tar.gz → 2025.10.9.dev20251009__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- {nv_ingest_api-2025.10.7.dev20251007/src/nv_ingest_api.egg-info → nv_ingest_api-2025.10.9.dev20251009}/PKG-INFO +1 -1
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pdf/engines/pdfium.py +6 -4
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/nim_client.py +44 -8
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/service_clients/rest/rest_client.py +12 -1
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009/src/nv_ingest_api.egg-info}/PKG-INFO +1 -1
- nv_ingest_api-2025.10.9.dev20251009/src/udfs/llm_summarizer_udf.py +204 -0
- nv_ingest_api-2025.10.7.dev20251007/src/udfs/llm_summarizer_udf.py +0 -210
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/LICENSE +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/MANIFEST.in +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/README.md +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/pyproject.toml +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/setup.cfg +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/interface/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/interface/extract.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/interface/mutate.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/interface/store.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/interface/transform.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/interface/utility.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/enums/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/enums/common.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/audio/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/docx/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/html/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/html/html_extractor.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/image/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/image/chart_extractor.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/image/image_extractor.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/image/table_extractor.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pdf/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pptx/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/meta/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/meta/udf.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/mutate/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/mutate/deduplicate.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/mutate/filter.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/control_message_task.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/ingest_control_message.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/default_values.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/tracing/latency.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/tracing/logging.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/primitives/tracing/tagging.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/extract/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/extract/extract_html_schema.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/meta/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/meta/udf.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/mutate/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/store/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/transform/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/store/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/store/embed_text_upload.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/store/image_upload.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/transform/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/transform/caption_image.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/transform/embed_text.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/internal/transform/split_text.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/control_message/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/control_message/validators.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/converters/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/converters/bytetools.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/converters/containers.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/converters/datetools.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/converters/dftools.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/converters/formats.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/converters/type_mappings.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/dataloader/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/dataloader/dataloader.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/detectors/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/detectors/language.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/exception_handlers/converters.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/exception_handlers/decorators.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/exception_handlers/detectors.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/exception_handlers/pdf.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/exception_handlers/schemas.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/image_processing/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/image_processing/clustering.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/image_processing/processing.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/image_processing/table_and_chart.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/image_processing/transforms.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/imports/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/imports/callable_signatures.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/imports/dynamic_resolvers.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/introspection/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/introspection/class_inspect.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/introspection/function_inspect.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/logging/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/logging/configuration.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/logging/sanitize.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/message_brokers/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/metadata/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/metadata/aggregators.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/multi_processing/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/nim/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/pdf/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/pdf/pdfium.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/schema/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/schema/schema_validator.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/service_clients/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/service_clients/client_base.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/service_clients/kafka/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/service_clients/redis/redis_client.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/string_processing/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/string_processing/configuration.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/string_processing/yaml.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/system/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api/util/system/hardware_info.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api.egg-info/SOURCES.txt +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api.egg-info/dependency_links.txt +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api.egg-info/requires.txt +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/nv_ingest_api.egg-info/top_level.txt +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/udfs/__init__.py +0 -0
- {nv_ingest_api-2025.10.7.dev20251007 → nv_ingest_api-2025.10.9.dev20251009}/src/version.py +0 -0
|
@@ -332,6 +332,7 @@ def _extract_page_elements(
|
|
|
332
332
|
|
|
333
333
|
# Process each extracted element based on extraction flags
|
|
334
334
|
for page_idx, page_element in page_element_results:
|
|
335
|
+
page_reading_index = page_idx + 1
|
|
335
336
|
# Skip elements that shouldn't be extracted based on flags
|
|
336
337
|
if (not extract_tables) and (page_element.type_string == "table"):
|
|
337
338
|
continue
|
|
@@ -347,7 +348,7 @@ def _extract_page_elements(
|
|
|
347
348
|
# Construct metadata for the page element
|
|
348
349
|
page_element_meta = construct_page_element_metadata(
|
|
349
350
|
page_element,
|
|
350
|
-
|
|
351
|
+
page_reading_index,
|
|
351
352
|
page_count,
|
|
352
353
|
source_metadata,
|
|
353
354
|
base_unified_metadata,
|
|
@@ -473,6 +474,7 @@ def pdfium_extractor(
|
|
|
473
474
|
for page_idx in range(page_count):
|
|
474
475
|
page = doc.get_page(page_idx)
|
|
475
476
|
page_width, page_height = page.get_size()
|
|
477
|
+
page_reading_index = page_idx + 1
|
|
476
478
|
|
|
477
479
|
# Text extraction
|
|
478
480
|
if extract_text:
|
|
@@ -481,7 +483,7 @@ def pdfium_extractor(
|
|
|
481
483
|
text_meta = construct_text_metadata(
|
|
482
484
|
[page_text],
|
|
483
485
|
pdf_metadata.keywords,
|
|
484
|
-
|
|
486
|
+
page_reading_index,
|
|
485
487
|
-1,
|
|
486
488
|
-1,
|
|
487
489
|
-1,
|
|
@@ -499,7 +501,7 @@ def pdfium_extractor(
|
|
|
499
501
|
image_data = _extract_page_images(
|
|
500
502
|
extract_images_method,
|
|
501
503
|
page,
|
|
502
|
-
|
|
504
|
+
page_reading_index,
|
|
503
505
|
page_width,
|
|
504
506
|
page_height,
|
|
505
507
|
page_count,
|
|
@@ -518,7 +520,7 @@ def pdfium_extractor(
|
|
|
518
520
|
base64_image, _ = scale_image_to_encoding_size(base64_image, max_base64_size=2**24 - 1)
|
|
519
521
|
image_meta = construct_image_metadata_from_base64(
|
|
520
522
|
base64_image,
|
|
521
|
-
|
|
523
|
+
page_reading_index,
|
|
522
524
|
page_count,
|
|
523
525
|
source_metadata,
|
|
524
526
|
base_unified_metadata,
|
|
@@ -326,16 +326,52 @@ class NimClient:
|
|
|
326
326
|
|
|
327
327
|
outputs = [grpcclient.InferRequestedOutput(output_name) for output_name in output_names]
|
|
328
328
|
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
329
|
+
base_delay = 0.5
|
|
330
|
+
attempt = 0
|
|
331
|
+
retries_429 = 0
|
|
332
|
+
max_grpc_retries = self.max_429_retries
|
|
332
333
|
|
|
333
|
-
|
|
334
|
+
while attempt < self.max_retries:
|
|
335
|
+
try:
|
|
336
|
+
response = self.client.infer(
|
|
337
|
+
model_name=model_name, parameters=parameters, inputs=input_tensors, outputs=outputs
|
|
338
|
+
)
|
|
334
339
|
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
340
|
+
logger.debug(f"gRPC inference response: {response}")
|
|
341
|
+
|
|
342
|
+
if len(outputs) == 1:
|
|
343
|
+
return response.as_numpy(outputs[0].name())
|
|
344
|
+
else:
|
|
345
|
+
return [response.as_numpy(output.name()) for output in outputs]
|
|
346
|
+
|
|
347
|
+
except grpcclient.InferenceServerException as e:
|
|
348
|
+
status = e.status()
|
|
349
|
+
if status == "StatusCode.UNAVAILABLE" and "Exceeds maximum queue size".lower() in e.message().lower():
|
|
350
|
+
retries_429 += 1
|
|
351
|
+
logger.warning(
|
|
352
|
+
f"Received gRPC {status} for model '{model_name}'. "
|
|
353
|
+
f"Attempt {retries_429} of {max_grpc_retries}."
|
|
354
|
+
)
|
|
355
|
+
if retries_429 >= max_grpc_retries:
|
|
356
|
+
logger.error(f"Max retries for gRPC {status} exceeded for model '{model_name}'.")
|
|
357
|
+
raise
|
|
358
|
+
|
|
359
|
+
backoff_time = base_delay * (2**retries_429)
|
|
360
|
+
time.sleep(backoff_time)
|
|
361
|
+
continue
|
|
362
|
+
|
|
363
|
+
else:
|
|
364
|
+
# For other server-side errors (e.g., INVALID_ARGUMENT, NOT_FOUND),
|
|
365
|
+
# retrying will not help. We should fail fast.
|
|
366
|
+
logger.error(
|
|
367
|
+
f"Received non-retryable gRPC error from Triton for model '{model_name}': {e.message()}"
|
|
368
|
+
)
|
|
369
|
+
raise
|
|
370
|
+
|
|
371
|
+
except Exception as e:
|
|
372
|
+
# Catch any other unexpected exceptions (e.g., network issues not caught by Triton client)
|
|
373
|
+
logger.error(f"An unexpected error occurred during gRPC inference for model '{model_name}': {e}")
|
|
374
|
+
raise
|
|
339
375
|
|
|
340
376
|
def _http_infer(self, formatted_input: dict) -> dict:
|
|
341
377
|
"""
|
|
@@ -308,7 +308,18 @@ class RestClient(MessageBrokerClientBase):
|
|
|
308
308
|
|
|
309
309
|
retries: int = 0
|
|
310
310
|
url: str = f"{self._base_url}{self._fetch_endpoint}/{job_id}"
|
|
311
|
-
|
|
311
|
+
# Derive per-call timeout if provided; otherwise use default
|
|
312
|
+
if timeout is None:
|
|
313
|
+
req_timeout: Tuple[float, Optional[float]] = self._timeout
|
|
314
|
+
else:
|
|
315
|
+
if isinstance(timeout, tuple):
|
|
316
|
+
# Expect (connect, read)
|
|
317
|
+
connect_t = float(timeout[0])
|
|
318
|
+
read_t = None if (len(timeout) < 2 or timeout[1] is None) else float(timeout[1])
|
|
319
|
+
req_timeout = (connect_t, read_t)
|
|
320
|
+
else:
|
|
321
|
+
# Single float means override read timeout, keep a small connect timeout
|
|
322
|
+
req_timeout = (min(self._default_connect_timeout, 5.0), float(timeout))
|
|
312
323
|
|
|
313
324
|
while True:
|
|
314
325
|
result: Optional[Any] = None
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
LLM Content Summarizer UDF for NV-Ingest Pipeline
|
|
4
|
+
|
|
5
|
+
This UDF uses an LLM to generate concise summaries of text content chunks. These summaries are added to the metadata
|
|
6
|
+
for enhanced downstream processing and search capabilities.
|
|
7
|
+
|
|
8
|
+
These variables can be set in the environment before running the pipeline. These can be treated as kwargs.
|
|
9
|
+
- NVIDIA_API_KEY: API key for NVIDIA NIM endpoints (required)
|
|
10
|
+
- LLM_SUMMARIZATION_MODEL: Model to use (default: nvidia/llama-3.1-nemotron-70b-instruct)
|
|
11
|
+
- LLM_SUMMARIZATION_BASE_URL: base URL (default: https://integrate.api.nvidia.com/v1)
|
|
12
|
+
- LLM_SUMMARIZATION_TIMEOUT: API timeout in seconds (default: 60)
|
|
13
|
+
- LLM_MIN_CONTENT_LENGTH: Minimum content length to summarize (default: 50)
|
|
14
|
+
- LLM_MAX_CONTENT_LENGTH: Maximum content length to send to API (default: 12000)
|
|
15
|
+
TODO: Implement this
|
|
16
|
+
- NUM_CHUNKS: (Optional) Number of first and last pages to summarize. default=1
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import logging
|
|
20
|
+
import os
|
|
21
|
+
import time
|
|
22
|
+
|
|
23
|
+
# REMOVE BEFORE MERGING
|
|
24
|
+
# import yaml
|
|
25
|
+
# from pathlib import Path
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Module-level logger shared by the UDF entry point and its helpers.
logger = logging.getLogger(__name__)

# Prompt template sent to the LLM. The ``{content}`` placeholder is filled in
# with ``PROMPT.format(content=...)`` by ``_generate_llm_summary``.
PROMPT = """
Here are the contents from the first and last page of a document. Focus on the main purpose, key topics,
and important details. Just return the summary as a paragraph. Do not add special characters for formatting.
This summary will be used for document search and understanding.

[CONTENT]
{content}
[END CONTENT]
"""
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def content_summarizer(control_message: "IngestControlMessage") -> "IngestControlMessage":  # noqa: F821
    """
    UDF function that adds an LLM-generated document summary to the payload metadata.

    The text of the first and last chunk of the payload is joined and sent to
    an LLM; the resulting summary is stored through ``_store_summary``. The
    full payload (all chunks) is preserved: only the chunks used as LLM input
    are selected, the DataFrame written back is never truncated.

    Configuration is read from the environment: ``NVIDIA_API_KEY`` (required),
    ``LLM_SUMMARIZATION_MODEL``, ``LLM_SUMMARIZATION_BASE_URL``,
    ``LLM_MIN_CONTENT_LENGTH``, ``LLM_MAX_CONTENT_LENGTH`` and
    ``LLM_SUMMARIZATION_TIMEOUT``.

    Parameters
    ----------
    control_message : IngestControlMessage
        The control message containing the DataFrame payload with text content

    Returns
    -------
    IngestControlMessage
        The control message; its payload carries the summary when one was
        generated and is otherwise left untouched.
    """
    logger.info("UDF: Starting LLM content summarization")

    api_key = os.getenv("NVIDIA_API_KEY")
    model_name = os.getenv("LLM_SUMMARIZATION_MODEL", "nvidia/llama-3.1-nemotron-70b-instruct")
    base_url = os.getenv("LLM_SUMMARIZATION_BASE_URL", "https://integrate.api.nvidia.com/v1")
    min_content_length = int(os.getenv("LLM_MIN_CONTENT_LENGTH", 50))
    max_content_length = int(os.getenv("LLM_MAX_CONTENT_LENGTH", 12000))
    timeout = int(os.getenv("LLM_SUMMARIZATION_TIMEOUT", 60))

    stats = {
        "skipped": False,
        "failed": False,
        "tokens": 0,
        "duration": 0.0,
    }

    if not api_key:
        logger.error("NVIDIA_API_KEY not set. Skipping...")
        return control_message

    df = control_message.payload()

    if df is None or df.empty:
        logger.warning("No payload found. Nothing to summarize.")
        return control_message

    # Select first and last chunk for summarization.
    # According to docs/docs/extraction/user_defined_functions.md#understanding-the-dataframe-payload
    # the rows are not necessarily pages; they are chunks of data extracted from the document. Selecting
    # pages would require parsing the payload to see which chunks correspond to which pages.
    #
    # The selection is kept in its own variable: rebinding ``df`` here would make the payload written
    # back below contain only these two rows and silently drop every chunk in between.
    if len(df) > 1:
        # TODO: add feature to select N first and last chunks
        selected = df.iloc[[0, -1]]
    else:
        logger.info("Document has only one chunk")
        selected = df

    # Combine the usable content into a single string (skipped chunks yield "")
    content_list = selected.apply(
        _extract_content,
        axis=1,
        min_content_length=min_content_length,
        max_content_length=max_content_length,
        stats=stats,
    )
    content = " ".join(text for text in content_list if text)

    if not content:
        # Every selected chunk was skipped; an LLM call on empty input would only
        # cost time and produce a made-up summary.
        stats["skipped"] = True
        logger.warning("No usable content found. Skipping summarization.")
        return control_message

    # Nicely ask LLM to summarize content
    summary, stats["duration"] = _generate_llm_summary(content, model_name, base_url, api_key, timeout)

    stats["failed"] = summary is None
    if not stats["failed"]:
        stats["tokens"] = _estimate_tokens(content)
        logger.info("Summarized %d tokens in %f seconds using %s", stats["tokens"], stats["duration"], model_name)
        _store_summary(df, summary, model_name)

        # Update the control message with the (complete) modified DataFrame
        control_message.payload(df)
    else:
        logger.warning("%s failed to summarize content", model_name)

    return control_message
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _extract_content(row, stats: dict, min_content_length: int = 50, max_content_length: int = 12000) -> str | None:
|
|
122
|
+
"""Extract text content from row"""
|
|
123
|
+
metadata = row.get("metadata")
|
|
124
|
+
|
|
125
|
+
if isinstance(metadata, dict):
|
|
126
|
+
content = metadata.get("content")
|
|
127
|
+
if content is not None:
|
|
128
|
+
content = content.strip()
|
|
129
|
+
if len(content) < min_content_length:
|
|
130
|
+
stats["skipped"] = True
|
|
131
|
+
logger.warning(f"Content less than min={min_content_length}. Skipping...")
|
|
132
|
+
content = ""
|
|
133
|
+
elif len(content) > max_content_length:
|
|
134
|
+
logger.warning(f"Truncating content to {max_content_length} characters")
|
|
135
|
+
content = content[:max_content_length]
|
|
136
|
+
else:
|
|
137
|
+
stats["skipped"] = True
|
|
138
|
+
content = ""
|
|
139
|
+
|
|
140
|
+
else:
|
|
141
|
+
stats["skipped"] = True
|
|
142
|
+
logger.warning("No metadata found. Skipping...")
|
|
143
|
+
content = ""
|
|
144
|
+
|
|
145
|
+
return content
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _generate_llm_summary(
    content: str,
    model_name: str,
    base_url: str,
    api_key: str,
    timeout: int,
) -> tuple[str | None, float]:
    """Ask an LLM to summarize content extracted from doc.

    Parameters
    ----------
    content : str
        Text inserted into the ``PROMPT`` template.
    model_name : str
        Model identifier passed to the chat-completions endpoint.
    base_url : str
        Base URL of the OpenAI-compatible API.
    api_key : str
        API key used to authenticate.
    timeout : int
        Client timeout in seconds.

    Returns
    -------
    tuple[str | None, float]
        ``(summary, duration_seconds)``. ``summary`` is ``None`` when the call
        fails or the model returns no usable text; the elapsed time is
        reported in either case.
    """
    # perf_counter is monotonic, so the measured duration cannot be distorted
    # (or go negative) if the system clock is adjusted mid-request.
    start_time = time.perf_counter()
    try:
        from openai import OpenAI

        client = OpenAI(base_url=base_url, api_key=api_key, timeout=timeout)

        # Restart the clock so a successful duration covers only the request itself.
        start_time = time.perf_counter()
        completion = client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": PROMPT.format(content=content)}],
            max_tokens=400,  # Increased for more comprehensive summaries
            temperature=0.7,
        )
        duration = time.perf_counter() - start_time

        if not completion.choices:
            return None, duration

        message_text = completion.choices[0].message.content
        summary = message_text.strip() if message_text is not None else ""
        if not summary:
            # A missing or blank message is a failed summary, not an API error.
            logger.warning("LLM returned an empty summary")
            return None, duration
        return summary, duration

    except Exception as e:
        logger.error("API call failed: %s", e)
        # Report the elapsed time even on failure so callers can still record it.
        return None, time.perf_counter() - start_time
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def _store_summary(df, summary: str, model_name: str):
|
|
185
|
+
"""Add summary to metadata and store in df"""
|
|
186
|
+
# hardcoded heuristic to store everything on chunk 0's metadata
|
|
187
|
+
row_0 = df.iloc[0]
|
|
188
|
+
|
|
189
|
+
# this is a reference to a dictionary that is stored in the dataframe
|
|
190
|
+
# and is modified in place
|
|
191
|
+
metadata = row_0.get("metadata")
|
|
192
|
+
|
|
193
|
+
if metadata.get("custom_content") is None:
|
|
194
|
+
metadata["custom_content"] = {}
|
|
195
|
+
metadata["custom_content"]["llm_summarizer_udf"] = {"summary": summary, "model": model_name}
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _estimate_tokens(text: str) -> int:
|
|
199
|
+
"""Rough estimate (~4 characters per token)"""
|
|
200
|
+
return len(text) // 4
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _safe_model_name(name: str) -> str:
|
|
204
|
+
return name.replace("/", "__").replace("-", "_")
|
|
@@ -1,210 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
LLM Content Summarizer UDF for NV-Ingest Pipeline
|
|
4
|
-
|
|
5
|
-
This UDF uses an LLM API to generate concise summaries
|
|
6
|
-
of text content chunks, adding AI-generated summaries to the metadata for
|
|
7
|
-
enhanced downstream processing and search capabilities.
|
|
8
|
-
|
|
9
|
-
Environment Variables:
|
|
10
|
-
- NVIDIA_API_KEY: API key for NVIDIA NIM endpoints (required)
|
|
11
|
-
- LLM_SUMMARIZATION_MODEL: Model to use (default: nvidia/llama-3.1-nemotron-70b-instruct)
|
|
12
|
-
- LLM_SUMMARIZATION_BASE_URL: API base URL (default: https://integrate.api.nvidia.com/v1)
|
|
13
|
-
- LLM_SUMMARIZATION_TIMEOUT: API timeout in seconds (default: 60)
|
|
14
|
-
- LLM_MIN_CONTENT_LENGTH: Minimum content length to summarize (default: 50)
|
|
15
|
-
- LLM_MAX_CONTENT_LENGTH: Maximum content length to send to API (default: 12000)
|
|
16
|
-
"""
|
|
17
|
-
|
|
18
|
-
import os
|
|
19
|
-
import logging
|
|
20
|
-
from typing import Optional
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
def content_summarizer(control_message: "IngestControlMessage") -> "IngestControlMessage":  # noqa: F821
    """
    UDF function that adds LLM-generated summaries to text content chunks.

    Every row of the payload is processed independently: its content is
    extracted, length-checked, truncated if necessary, summarized through an
    LLM API, and the summary is stored in the row's metadata under
    ``custom_content``. Rows without usable content are skipped; per-row
    failures are logged and counted without aborting the run.

    Configuration is read from the environment: ``NVIDIA_API_KEY`` (required),
    ``LLM_SUMMARIZATION_MODEL``, ``LLM_SUMMARIZATION_BASE_URL``,
    ``LLM_SUMMARIZATION_TIMEOUT``, ``LLM_MIN_CONTENT_LENGTH`` and
    ``LLM_MAX_CONTENT_LENGTH``.

    Parameters
    ----------
    control_message : IngestControlMessage
        The control message containing the DataFrame payload with text content

    Returns
    -------
    IngestControlMessage
        The modified control message with LLM summaries added to metadata
    """
    logger = logging.getLogger(__name__)
    logger.info("UDF: Starting LLM content summarization")

    # Get configuration from environment
    api_key = os.getenv("NVIDIA_API_KEY", "")
    model_name = os.getenv("LLM_SUMMARIZATION_MODEL", "nvidia/llama-3.1-nemotron-70b-instruct")
    base_url = os.getenv("LLM_SUMMARIZATION_BASE_URL", "https://integrate.api.nvidia.com/v1")
    timeout = int(os.getenv("LLM_SUMMARIZATION_TIMEOUT", "60"))
    min_content_length = int(os.getenv("LLM_MIN_CONTENT_LENGTH", "50"))
    max_content_length = int(os.getenv("LLM_MAX_CONTENT_LENGTH", "12000"))

    if not api_key:
        logger.warning("NVIDIA_API_KEY not found, skipping summarization")
        return control_message

    # Get the DataFrame payload
    df = control_message.payload()
    if df is None or len(df) == 0:
        logger.warning("No payload found in control message")
        return control_message

    logger.info(f"Processing {len(df)} rows for LLM summarization")

    # Initialize OpenAI client with error handling. The import lives inside the
    # guard so that a missing ``openai`` package is logged and the message is
    # passed through, instead of raising before any of the checks above ran.
    try:
        from openai import OpenAI

        client = OpenAI(base_url=base_url, api_key=api_key, timeout=timeout)
    except Exception as e:
        logger.error(f"Failed to initialize OpenAI client: {e}")
        return control_message

    # Stats for reporting
    stats = {"processed": 0, "summarized": 0, "skipped": 0, "failed": 0}

    # Process each row
    for idx, row in df.iterrows():
        stats["processed"] += 1

        try:
            # Extract content - be more flexible about where it comes from
            content = _extract_content(row, logger)

            if not content:
                stats["skipped"] += 1
                continue

            content = content.strip()
            if len(content) < min_content_length:
                stats["skipped"] += 1
                continue

            # Truncate if needed
            if len(content) > max_content_length:
                content = content[:max_content_length]

            # Generate summary
            summary = _generate_summary(client, content, model_name, logger)

            if summary:
                # Add to metadata
                _add_summary(df, idx, row, summary, model_name, logger)
                stats["summarized"] += 1
            else:
                stats["failed"] += 1

        except Exception as e:
            # Best effort: one bad row must not abort the remaining rows.
            stats["failed"] += 1
            logger.error(f"Row {idx}: Error processing content: {e}")

    # Update the control message with modified DataFrame
    control_message.payload(df)

    logger.info(
        f"LLM summarization complete: {stats['summarized']}/{stats['processed']} documents summarized, "
        f"{stats['skipped']} skipped, {stats['failed']} failed"
    )

    return control_message
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
def _extract_content(row, logger) -> Optional[str]:
|
|
129
|
-
"""Extract text content from row, trying multiple locations."""
|
|
130
|
-
content = ""
|
|
131
|
-
|
|
132
|
-
# Try different locations for content
|
|
133
|
-
if isinstance(row.get("metadata"), dict):
|
|
134
|
-
metadata = row["metadata"]
|
|
135
|
-
|
|
136
|
-
# Primary location: metadata.content
|
|
137
|
-
content = metadata.get("content", "")
|
|
138
|
-
|
|
139
|
-
# If no content, try other locations
|
|
140
|
-
if not content:
|
|
141
|
-
# Try in text_metadata
|
|
142
|
-
text_metadata = metadata.get("text_metadata", {})
|
|
143
|
-
content = text_metadata.get("text", "") or text_metadata.get("content", "")
|
|
144
|
-
|
|
145
|
-
# Try top-level content field
|
|
146
|
-
if not content:
|
|
147
|
-
content = row.get("content", "")
|
|
148
|
-
|
|
149
|
-
if not content:
|
|
150
|
-
return None
|
|
151
|
-
|
|
152
|
-
return content
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
def _generate_summary(client, content: str, model_name: str, logger) -> Optional[str]:
|
|
156
|
-
"""Generate summary with robust error handling."""
|
|
157
|
-
prompt = f"""Please provide a comprehensive 3-4 sentence summary of the following document:
|
|
158
|
-
|
|
159
|
-
{content}
|
|
160
|
-
|
|
161
|
-
Focus on the main purpose, key topics, and important details.
|
|
162
|
-
This summary will be used for document search and understanding.
|
|
163
|
-
|
|
164
|
-
Summary:"""
|
|
165
|
-
|
|
166
|
-
try:
|
|
167
|
-
completion = client.chat.completions.create(
|
|
168
|
-
model=model_name,
|
|
169
|
-
messages=[{"role": "user", "content": prompt}],
|
|
170
|
-
max_tokens=400, # Increased for more comprehensive summaries
|
|
171
|
-
temperature=0.7,
|
|
172
|
-
)
|
|
173
|
-
|
|
174
|
-
if completion.choices and len(completion.choices) > 0:
|
|
175
|
-
summary = completion.choices[0].message.content.strip()
|
|
176
|
-
return summary
|
|
177
|
-
else:
|
|
178
|
-
return None
|
|
179
|
-
|
|
180
|
-
except Exception as e:
|
|
181
|
-
logger.error(f"API call failed: {e}")
|
|
182
|
-
return None
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
def _add_summary(df, idx: int, row, summary: str, model_name: str, logger):
|
|
186
|
-
"""Add summary to metadata with safe handling."""
|
|
187
|
-
try:
|
|
188
|
-
# Get current metadata or create new dict - handle None case properly
|
|
189
|
-
existing_metadata = row.get("metadata")
|
|
190
|
-
if existing_metadata is not None and isinstance(existing_metadata, dict):
|
|
191
|
-
metadata = dict(existing_metadata) # Create a copy
|
|
192
|
-
else:
|
|
193
|
-
metadata = {}
|
|
194
|
-
|
|
195
|
-
# Ensure custom_content exists
|
|
196
|
-
if "custom_content" not in metadata or metadata["custom_content"] is None:
|
|
197
|
-
metadata["custom_content"] = {}
|
|
198
|
-
|
|
199
|
-
# Add LLM summary
|
|
200
|
-
metadata["custom_content"]["llm_summary"] = {"summary": summary, "model": model_name}
|
|
201
|
-
|
|
202
|
-
# Update the DataFrame at the specific index
|
|
203
|
-
try:
|
|
204
|
-
df.at[idx, "metadata"] = metadata
|
|
205
|
-
except Exception:
|
|
206
|
-
# Alternative approach: update the original row reference
|
|
207
|
-
df.iloc[idx]["metadata"] = metadata
|
|
208
|
-
|
|
209
|
-
except Exception as e:
|
|
210
|
-
logger.error(f"Failed to add summary to row {idx}: {e}")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|