nv-ingest-api 2025.8.13.dev20250813__tar.gz → 2025.8.15.dev20250815__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api.egg-info → nv_ingest_api-2025.8.15.dev20250815}/PKG-INFO +1 -1
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/enums/common.py +37 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/image/image_extractor.py +5 -1
- nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/meta/udf.py +232 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/ingest_control_message.py +63 -22
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/tracing/tagging.py +102 -15
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +40 -4
- nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/schemas/meta/udf.py +23 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/transform/embed_text.py +5 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/exception_handlers/decorators.py +104 -156
- nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/imports/callable_signatures.py +108 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/imports/dynamic_resolvers.py +53 -5
- nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/introspection/class_inspect.py +145 -0
- nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/introspection/function_inspect.py +65 -0
- nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/logging/configuration.py +102 -0
- nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
- nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/string_processing/configuration.py +682 -0
- nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/string_processing/yaml.py +45 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/system/hardware_info.py +178 -13
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api.egg-info}/PKG-INFO +1 -1
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api.egg-info/SOURCES.txt +8 -0
- nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/util/imports/callable_signatures.py +0 -50
- nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/util/logging/configuration.py +0 -38
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/LICENSE +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/MANIFEST.in +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/README.md +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/pyproject.toml +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/setup.cfg +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/interface/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/interface/extract.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/interface/mutate.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/interface/store.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/interface/transform.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/interface/utility.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/enums/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/audio/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/docx/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/html/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/html/html_extractor.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/image/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/image/chart_extractor.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/image/table_extractor.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pdf/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pptx/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/internal/mutate → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/meta}/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/internal/primitives/nim/model_interface → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/mutate}/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/mutate/deduplicate.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/mutate/filter.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/control_message_task.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/default_values.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/internal/schemas → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/primitives/nim/model_interface}/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/nim_client.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/tracing/latency.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/primitives/tracing/logging.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/internal/schemas/extract → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/schemas}/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/internal/schemas/meta → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/schemas/extract}/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/extract/extract_html_schema.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/internal/schemas/mutate → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/schemas/meta}/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/internal/schemas/store → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/schemas/mutate}/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/internal/schemas/transform → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/schemas/store}/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/internal/store → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/schemas/transform}/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/internal/transform → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/store}/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/store/embed_text_upload.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/store/image_upload.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/util → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/internal/transform}/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/transform/caption_image.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/internal/transform/split_text.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/util/imports → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util}/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/control_message/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/control_message/validators.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/converters/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/converters/bytetools.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/converters/containers.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/converters/datetools.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/converters/dftools.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/converters/formats.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/converters/type_mappings.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/detectors/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/detectors/language.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/exception_handlers/converters.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/exception_handlers/detectors.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/exception_handlers/pdf.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/exception_handlers/schemas.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/image_processing/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/image_processing/clustering.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/image_processing/processing.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/image_processing/table_and_chart.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/image_processing/transforms.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/util/message_brokers → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/imports}/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/util/schema → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/introspection}/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/logging/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/util/service_clients → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/message_brokers}/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/metadata/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/metadata/aggregators.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/multi_processing/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/nim/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/pdf/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/pdf/pdfium.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813/src/nv_ingest_api/util/service_clients/redis → nv_ingest_api-2025.8.15.dev20250815/src/nv_ingest_api/util/schema}/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/schema/schema_validator.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/service_clients/client_base.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/service_clients/kafka/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/service_clients/redis/redis_client.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/service_clients/rest/rest_client.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/string_processing/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api/util/system/__init__.py +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api.egg-info/dependency_links.txt +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api.egg-info/requires.txt +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/nv_ingest_api.egg-info/top_level.txt +0 -0
- {nv_ingest_api-2025.8.13.dev20250813 → nv_ingest_api-2025.8.15.dev20250815}/src/version.py +0 -0
|
@@ -386,6 +386,40 @@ class StatusEnum(str, Enum):
|
|
|
386
386
|
SUCCESS: str = "success"
|
|
387
387
|
|
|
388
388
|
|
|
389
|
+
class PipelinePhase(int, Enum):
    """
    Integer-valued identifier for the logical phase a pipeline stage belongs to.

    Members are numbered consecutively from 0 in declaration order and, because the
    enum mixes in ``int``, compare and sort like plain integers.

    Attributes
    ----------
    PRE_PROCESSING : int
        Pre-processing phase (0).
    EXTRACTION : int
        Extraction phase (1).
    POST_PROCESSING : int
        Post-processing phase (2).
    MUTATION : int
        Mutation phase (3).
    TRANSFORM : int
        Transform phase (4).
    RESPONSE : int
        Response phase (5).
    TELEMETRY : int
        Telemetry phase (6).
    DRAIN : int
        Drain phase (7).
    """

    PRE_PROCESSING = 0
    EXTRACTION = 1
    POST_PROCESSING = 2
    MUTATION = 3
    TRANSFORM = 4
    RESPONSE = 5
    TELEMETRY = 6
    DRAIN = 7
|
|
421
|
+
|
|
422
|
+
|
|
389
423
|
class TableFormatEnum(str, Enum):
|
|
390
424
|
"""
|
|
391
425
|
Enum for representing table formats.
|
|
@@ -446,6 +480,8 @@ class TaskTypeEnum(str, Enum):
|
|
|
446
480
|
Represents a task for extracting chart data.
|
|
447
481
|
INFOGRAPHIC_DATA_EXTRACT : str
|
|
448
482
|
Represents a task for extracting infographic data.
|
|
483
|
+
UDF : str
|
|
484
|
+
Represents a user-defined function task.
|
|
449
485
|
"""
|
|
450
486
|
|
|
451
487
|
AUDIO_DATA_EXTRACT: str = "audio_data_extract"
|
|
@@ -460,6 +496,7 @@ class TaskTypeEnum(str, Enum):
|
|
|
460
496
|
STORE_EMBEDDING: str = "store_embedding"
|
|
461
497
|
STORE: str = "store"
|
|
462
498
|
TABLE_DATA_EXTRACT: str = "table_data_extract"
|
|
499
|
+
UDF: str = "udf"
|
|
463
500
|
VDB_UPLOAD: str = "vdb_upload"
|
|
464
501
|
|
|
465
502
|
|
|
@@ -108,8 +108,12 @@ def _decode_and_extract_from_image(
|
|
|
108
108
|
f"decode_and_extract: Extracting image content using image_extraction_config: "
|
|
109
109
|
f"{validated_extraction_config}"
|
|
110
110
|
)
|
|
111
|
+
# Ensure we pass the correct nested config type (ImageConfigSchema) to helpers.
|
|
112
|
+
# Some callers provide the full ImageExtractorSchema; extract its inner image_extraction_config.
|
|
111
113
|
if validated_extraction_config is not None:
|
|
112
|
-
|
|
114
|
+
inner_cfg = getattr(validated_extraction_config, "image_extraction_config", validated_extraction_config)
|
|
115
|
+
if inner_cfg is not None:
|
|
116
|
+
extract_params["image_extraction_config"] = inner_cfg
|
|
113
117
|
|
|
114
118
|
if execution_trace_log is not None:
|
|
115
119
|
extract_params["trace_info"] = execution_trace_log
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import inspect
|
|
7
|
+
import logging
|
|
8
|
+
import time
|
|
9
|
+
from typing import Any, Dict, List, Optional
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
|
|
12
|
+
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_all_tasks_by_type
|
|
13
|
+
from nv_ingest_api.internal.schemas.meta.udf import UDFStageSchema
|
|
14
|
+
from nv_ingest_api.util.imports.callable_signatures import ingest_callable_signature
|
|
15
|
+
|
|
16
|
+
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
|
|
20
|
+
class CachedUDF:
|
|
21
|
+
"""Cached UDF function with metadata"""
|
|
22
|
+
|
|
23
|
+
function: callable
|
|
24
|
+
function_name: str
|
|
25
|
+
signature_validated: bool
|
|
26
|
+
created_at: float
|
|
27
|
+
last_used: float
|
|
28
|
+
use_count: int
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class UDFCache:
|
|
32
|
+
"""LRU cache for compiled and validated UDF functions"""
|
|
33
|
+
|
|
34
|
+
def __init__(self, max_size: int = 128, ttl_seconds: Optional[int] = 3600):
|
|
35
|
+
self.max_size = max_size
|
|
36
|
+
self.ttl_seconds = ttl_seconds
|
|
37
|
+
self.cache: Dict[str, CachedUDF] = {}
|
|
38
|
+
self.access_order: List[str] = [] # For LRU tracking
|
|
39
|
+
|
|
40
|
+
def _generate_cache_key(self, udf_function_str: str, udf_function_name: str) -> str:
|
|
41
|
+
"""Generate cache key from UDF string and function name"""
|
|
42
|
+
content = f"{udf_function_str.strip()}:{udf_function_name}"
|
|
43
|
+
return hashlib.sha256(content.encode()).hexdigest()
|
|
44
|
+
|
|
45
|
+
def _evict_lru(self):
|
|
46
|
+
"""Remove least recently used item"""
|
|
47
|
+
if self.access_order:
|
|
48
|
+
lru_key = self.access_order.pop(0)
|
|
49
|
+
self.cache.pop(lru_key, None)
|
|
50
|
+
|
|
51
|
+
def _cleanup_expired(self):
|
|
52
|
+
"""Remove expired entries if TTL is configured"""
|
|
53
|
+
if not self.ttl_seconds:
|
|
54
|
+
return
|
|
55
|
+
|
|
56
|
+
current_time = time.time()
|
|
57
|
+
expired_keys = [
|
|
58
|
+
key for key, cached_udf in self.cache.items() if current_time - cached_udf.created_at > self.ttl_seconds
|
|
59
|
+
]
|
|
60
|
+
|
|
61
|
+
for key in expired_keys:
|
|
62
|
+
self.cache.pop(key, None)
|
|
63
|
+
if key in self.access_order:
|
|
64
|
+
self.access_order.remove(key)
|
|
65
|
+
|
|
66
|
+
def get(self, udf_function_str: str, udf_function_name: str) -> Optional[CachedUDF]:
|
|
67
|
+
"""Get cached UDF function if available"""
|
|
68
|
+
self._cleanup_expired()
|
|
69
|
+
|
|
70
|
+
cache_key = self._generate_cache_key(udf_function_str, udf_function_name)
|
|
71
|
+
|
|
72
|
+
if cache_key in self.cache:
|
|
73
|
+
# Update access tracking
|
|
74
|
+
if cache_key in self.access_order:
|
|
75
|
+
self.access_order.remove(cache_key)
|
|
76
|
+
self.access_order.append(cache_key)
|
|
77
|
+
|
|
78
|
+
# Update usage stats
|
|
79
|
+
cached_udf = self.cache[cache_key]
|
|
80
|
+
cached_udf.last_used = time.time()
|
|
81
|
+
cached_udf.use_count += 1
|
|
82
|
+
|
|
83
|
+
return cached_udf
|
|
84
|
+
|
|
85
|
+
return None
|
|
86
|
+
|
|
87
|
+
def put(
|
|
88
|
+
self, udf_function_str: str, udf_function_name: str, function: callable, signature_validated: bool = True
|
|
89
|
+
) -> str:
|
|
90
|
+
"""Cache a compiled and validated UDF function"""
|
|
91
|
+
cache_key = self._generate_cache_key(udf_function_str, udf_function_name)
|
|
92
|
+
|
|
93
|
+
# Evict LRU if at capacity
|
|
94
|
+
while len(self.cache) >= self.max_size:
|
|
95
|
+
self._evict_lru()
|
|
96
|
+
|
|
97
|
+
current_time = time.time()
|
|
98
|
+
cached_udf = CachedUDF(
|
|
99
|
+
function=function,
|
|
100
|
+
function_name=udf_function_name,
|
|
101
|
+
signature_validated=signature_validated,
|
|
102
|
+
created_at=current_time,
|
|
103
|
+
last_used=current_time,
|
|
104
|
+
use_count=1,
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
self.cache[cache_key] = cached_udf
|
|
108
|
+
self.access_order.append(cache_key)
|
|
109
|
+
|
|
110
|
+
return cache_key
|
|
111
|
+
|
|
112
|
+
def get_stats(self) -> Dict[str, Any]:
    """Summarise the cache: entry count, capacity, total uses and the most-used entry.

    When the cache is empty the most-used fields are ``None`` and ``0``.
    Ties on ``use_count`` resolve to the entry that comes first in the cache.
    """
    total_uses = 0
    most_used = None
    for entry in self.cache.values():
        total_uses += entry.use_count
        # Strict '>' keeps the first entry among equals.
        if most_used is None or entry.use_count > most_used.use_count:
            most_used = entry

    if most_used:
        top_name = most_used.function_name
        top_count = most_used.use_count
    else:
        top_name = None
        top_count = 0

    stats = {
        "size": len(self.cache),
        "max_size": self.max_size,
        "total_uses": total_uses,
        "most_used_function": top_name,
        "most_used_count": top_count,
    }
    return stats
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
# Global cache instance
|
|
126
|
+
# Module-level cache used by udf_stage_callable_fn and get_udf_cache_stats: up to 128 entries, ttl_seconds=3600.
_udf_cache = UDFCache(max_size=128, ttl_seconds=3600)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def compile_and_validate_udf(udf_function_str: str, udf_function_name: str, task_num: int) -> callable:
    """
    Execute a UDF source string and return the named function after a signature check.

    Parameters
    ----------
    udf_function_str : str
        Python source code that defines the UDF.
    udf_function_name : str
        Name of the callable to pull out of the executed source.
    task_num : int
        Task number, used only to label error messages.

    Returns
    -------
    callable
        The object bound to ``udf_function_name`` after executing the source.

    Raises
    ------
    ValueError
        If the source fails to execute, the name is missing or not callable,
        or signature validation fails. The underlying exception, when there
        is one, is attached as ``__cause__``.
    """
    # SECURITY: exec() runs the submitted source with the full privileges of
    # this process. The fresh dict only keeps the UDF's names out of this
    # module's globals; Python inserts __builtins__ into it, so it is NOT a
    # sandbox. Only accept UDF source from trusted submitters.
    namespace: Dict[str, Any] = {}
    try:
        exec(udf_function_str, namespace)
    except Exception as e:
        raise ValueError(f"UDF task {task_num} failed to execute: {e}") from e

    # Extract the specified function from the namespace
    udf_function = namespace.get(udf_function_name)
    if not callable(udf_function):
        raise ValueError(f"UDF task {task_num}: Specified UDF function '{udf_function_name}' not found or not callable")

    # Validate the UDF function signature
    try:
        ingest_callable_signature(inspect.signature(udf_function))
    except Exception as e:
        raise ValueError(f"UDF task {task_num} has invalid function signature: {e}") from e

    return udf_function
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def get_udf_cache_stats() -> Dict[str, Any]:
    """Return the statistics dictionary reported by the module-level UDF cache."""
    stats = _udf_cache.get_stats()
    return stats
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def udf_stage_callable_fn(control_message: IngestControlMessage, stage_config: UDFStageSchema) -> IngestControlMessage:
    """
    UDF stage callable function that processes UDF tasks in a control message.

    All tasks of type "udf" are removed from the control message up front and
    then executed one after another. Each UDF receives the control message
    returned by the previous one. Compiled UDFs are looked up in, and stored
    to, the module-level UDF cache.

    Parameters
    ----------
    control_message : IngestControlMessage
        The control message containing UDF tasks to process
    stage_config : UDFStageSchema
        Configuration for the UDF stage; ``ignore_empty_udf`` decides whether
        a missing or empty UDF is skipped or treated as an error.

    Returns
    -------
    IngestControlMessage
        The control message after processing all UDF tasks

    Raises
    ------
    ValueError
        If no UDF tasks exist or a UDF source is empty (and
        ``ignore_empty_udf`` is false), if a function name is missing, if
        compilation/validation fails, if the UDF raises, or if it returns
        something other than an IngestControlMessage.
    """
    logger.debug("Starting UDF stage processing")

    # Extract all UDF tasks from control message using free function
    try:
        all_task_configs = remove_all_tasks_by_type(control_message, "udf")
    except ValueError:
        # No UDF tasks found
        if stage_config.ignore_empty_udf:
            logger.debug("No UDF tasks found, ignoring as configured")
            return control_message
        else:
            raise ValueError("No UDF tasks found in control message")

    total_tasks = len(all_task_configs)

    # Process each UDF task sequentially
    for task_num, task_config in enumerate(all_task_configs, 1):
        logger.debug(f"Processing UDF task {task_num} of {total_tasks}")

        # A property that is present but None is treated like a missing one;
        # calling .strip() on None would otherwise raise AttributeError.
        udf_function_str = (task_config.get("udf_function") or "").strip()
        udf_function_name = (task_config.get("udf_function_name") or "").strip()

        # Skip empty UDF functions if configured to ignore them
        if not udf_function_str:
            if stage_config.ignore_empty_udf:
                logger.debug(f"UDF task {task_num} has empty function, skipping as configured")
                continue
            else:
                raise ValueError(f"UDF task {task_num} has empty function string")

        # Validate that function name is provided
        if not udf_function_name:
            raise ValueError(f"UDF task {task_num} missing required 'udf_function_name' property")

        # Reuse a previously compiled UDF when available; otherwise compile,
        # validate and cache it.
        cached_udf = _udf_cache.get(udf_function_str, udf_function_name)
        if cached_udf:
            udf_function = cached_udf.function
        else:
            udf_function = compile_and_validate_udf(udf_function_str, udf_function_name, task_num)
            _udf_cache.put(udf_function_str, udf_function_name, udf_function)

        # Execute the UDF function with the control message
        try:
            control_message = udf_function(control_message)
        except Exception as e:
            # Chain the original error so its traceback is kept as __cause__.
            raise ValueError(f"UDF task {task_num} execution failed: {str(e)}") from e

        # Validate that the UDF function returned an IngestControlMessage
        if not isinstance(control_message, IngestControlMessage):
            raise ValueError(f"UDF task {task_num} must return an IngestControlMessage, got {type(control_message)}")

        logger.debug(f"UDF task {task_num} completed successfully")

    logger.debug(f"UDF stage processing completed. Processed {total_tasks} UDF tasks")
    return control_message
|
|
@@ -5,10 +5,11 @@
|
|
|
5
5
|
import copy
|
|
6
6
|
import re
|
|
7
7
|
from datetime import datetime
|
|
8
|
+
from collections import defaultdict
|
|
9
|
+
from typing import Any, Dict, Generator, List, Optional, Union
|
|
8
10
|
|
|
9
11
|
import logging
|
|
10
12
|
import pandas as pd
|
|
11
|
-
from typing import Any, Dict, Generator, Union
|
|
12
13
|
|
|
13
14
|
from nv_ingest_api.internal.primitives.control_message_task import ControlMessageTask
|
|
14
15
|
|
|
@@ -55,6 +56,52 @@ def remove_task_by_type(ctrl_msg, task: str):
|
|
|
55
56
|
return removed_task.properties
|
|
56
57
|
|
|
57
58
|
|
|
59
|
+
def remove_all_tasks_by_type(ctrl_msg, task: str):
    """
    Remove every task of a given type from the control message.

    Matching tasks are collected first and removed afterwards, so the message
    is not modified while its tasks are being iterated.

    Parameters
    ----------
    ctrl_msg : IngestControlMessage
        The control message to strip tasks from.
    task : str
        The task type to match against each task's ``type``.

    Returns
    -------
    list[dict]
        The ``properties`` of each removed task, in the order the tasks were found.

    Raises
    ------
    ValueError
        If the control message holds no task of the requested type.
    """
    # Snapshot the matches before any removal happens.
    matches = [candidate for candidate in ctrl_msg.get_tasks() if candidate.type == task]

    if len(matches) == 0:
        err_msg = f"process_control_message: No tasks of type '{task}' found in control message."
        logger.error(err_msg)
        raise ValueError(err_msg)

    # Removal is by task id; keep the properties of whatever was removed.
    return [ctrl_msg.remove_task(match.id).properties for match in matches]
|
|
103
|
+
|
|
104
|
+
|
|
58
105
|
class IngestControlMessage:
|
|
59
106
|
"""
|
|
60
107
|
A control message class for ingesting tasks and managing associated metadata,
|
|
@@ -65,47 +112,41 @@ class IngestControlMessage:
|
|
|
65
112
|
"""
|
|
66
113
|
Initialize a new IngestControlMessage instance.
|
|
67
114
|
"""
|
|
68
|
-
self._tasks: Dict[str, ControlMessageTask] =
|
|
115
|
+
self._tasks: Dict[str, List[ControlMessageTask]] = defaultdict(list)
|
|
69
116
|
self._metadata: Dict[str, Any] = {}
|
|
70
117
|
self._timestamps: Dict[str, datetime] = {}
|
|
71
|
-
self._payload: pd.DataFrame =
|
|
118
|
+
self._payload: Optional[pd.DataFrame] = None
|
|
72
119
|
self._config: Dict[str, Any] = {}
|
|
73
120
|
|
|
74
121
|
def add_task(self, task: ControlMessageTask):
    """
    Store a task under its ID.

    Tasks sharing an ID are all kept, in the order they were added.
    """
    same_id_tasks = self._tasks[task.id]
    same_id_tasks.append(task)
|
|
86
126
|
|
|
87
127
|
def get_tasks(self) -> Generator[ControlMessageTask, None, None]:
    """
    Yield every stored task, one ID group after another.
    """
    for same_id_tasks in self._tasks.values():
        for stored_task in same_id_tasks:
            yield stored_task
|
|
92
133
|
|
|
93
134
|
def has_task(self, task_id: str) -> bool:
    """
    Report whether at least one task is stored under the given ID.
    """
    # .get() avoids creating an entry for an unknown ID.
    same_id_tasks = self._tasks.get(task_id)
    return bool(same_id_tasks)
|
|
98
139
|
|
|
99
140
|
def remove_task(self, task_id: str) -> ControlMessageTask:
    """
    Remove and return the first task stored under the given ID.

    The ID's entry is deleted once its last task has been removed.

    Raises
    ------
    RuntimeError
        If no task is stored under ``task_id``.
    """
    same_id_tasks = self._tasks.get(task_id)
    if not same_id_tasks:
        raise RuntimeError(f"Attempted to remove non-existent task with id: {task_id}")

    removed = same_id_tasks.pop(0)
    # Clean up empty lists
    if not same_id_tasks:
        del self._tasks[task_id]
    return removed
|
|
111
152
|
|
|
@@ -5,24 +5,30 @@
|
|
|
5
5
|
|
|
6
6
|
import functools
|
|
7
7
|
import inspect
|
|
8
|
+
import logging
|
|
8
9
|
import string
|
|
9
10
|
from datetime import datetime
|
|
11
|
+
from typing import Optional
|
|
10
12
|
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
11
14
|
|
|
12
|
-
|
|
15
|
+
|
|
16
|
+
def traceable(trace_name: Optional[str] = None):
|
|
13
17
|
"""
|
|
14
18
|
A decorator that adds entry and exit trace timestamps to a IngestControlMessage's metadata
|
|
15
19
|
based on the presence of a 'config::add_trace_tagging' flag.
|
|
16
20
|
|
|
17
21
|
This decorator checks if the 'config::add_trace_tagging' flag is set to True in the
|
|
18
22
|
message's metadata. If so, it records the entry and exit timestamps of the function
|
|
19
|
-
execution, using either a provided custom trace name
|
|
23
|
+
execution, using either a provided custom trace name, auto-detected stage name from
|
|
24
|
+
self.stage_name, or the function's name as fallback.
|
|
20
25
|
|
|
21
26
|
Parameters
|
|
22
27
|
----------
|
|
23
28
|
trace_name : str, optional
|
|
24
|
-
A custom name for the trace entries in the message metadata. If not provided,
|
|
25
|
-
|
|
29
|
+
A custom name for the trace entries in the message metadata. If not provided,
|
|
30
|
+
attempts to use self.stage_name from the decorated method's instance,
|
|
31
|
+
falling back to the function's name if neither is available.
|
|
26
32
|
|
|
27
33
|
Returns
|
|
28
34
|
-------
|
|
@@ -41,26 +47,48 @@ def traceable(trace_name=None):
|
|
|
41
47
|
- 'trace::entry::<trace_name>': The timestamp marking the function's entry.
|
|
42
48
|
- 'trace::exit::<trace_name>': The timestamp marking the function's exit.
|
|
43
49
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
50
|
+
Examples
|
|
51
|
+
--------
|
|
52
|
+
Automatic stage name detection (recommended):
|
|
47
53
|
|
|
48
|
-
>>> @traceable()
|
|
49
|
-
... def process_message(message):
|
|
54
|
+
>>> @traceable() # Uses self.stage_name automatically
|
|
55
|
+
... def process_message(self, message):
|
|
50
56
|
... pass
|
|
51
57
|
|
|
52
|
-
|
|
58
|
+
Explicit trace name (override):
|
|
53
59
|
|
|
54
|
-
>>>
|
|
55
|
-
...
|
|
56
|
-
...
|
|
57
|
-
|
|
60
|
+
>>> @traceable("custom_trace")
|
|
61
|
+
... def process_message(self, message):
|
|
62
|
+
... pass
|
|
63
|
+
|
|
64
|
+
Function without instance (uses function name):
|
|
65
|
+
|
|
66
|
+
>>> @traceable()
|
|
67
|
+
... def process_message(message):
|
|
68
|
+
... pass
|
|
58
69
|
"""
|
|
59
70
|
|
|
60
71
|
def decorator_trace_tagging(func):
|
|
61
72
|
@functools.wraps(func)
|
|
62
73
|
def wrapper_trace_tagging(*args, **kwargs):
|
|
63
74
|
ts_fetched = datetime.now()
|
|
75
|
+
|
|
76
|
+
# Determine the trace name to use
|
|
77
|
+
resolved_trace_name = trace_name
|
|
78
|
+
|
|
79
|
+
# If no explicit trace_name provided, try to get it from self.stage_name
|
|
80
|
+
if resolved_trace_name is None and len(args) >= 1:
|
|
81
|
+
stage_instance = args[0] # 'self' in method calls
|
|
82
|
+
if hasattr(stage_instance, "stage_name") and stage_instance.stage_name:
|
|
83
|
+
resolved_trace_name = stage_instance.stage_name
|
|
84
|
+
logger.debug(f"Using auto-detected trace name: '{resolved_trace_name}'")
|
|
85
|
+
else:
|
|
86
|
+
resolved_trace_name = func.__name__
|
|
87
|
+
logger.debug(f"Using function name as trace name: '{resolved_trace_name}'")
|
|
88
|
+
elif resolved_trace_name is None:
|
|
89
|
+
resolved_trace_name = func.__name__
|
|
90
|
+
logger.debug(f"Using function name as trace name: '{resolved_trace_name}'")
|
|
91
|
+
|
|
64
92
|
# Determine which argument is the message.
|
|
65
93
|
if hasattr(args[0], "has_metadata"):
|
|
66
94
|
message = args[0]
|
|
@@ -73,7 +101,7 @@ def traceable(trace_name=None):
|
|
|
73
101
|
message.get_metadata("config::add_trace_tagging") is True
|
|
74
102
|
)
|
|
75
103
|
|
|
76
|
-
trace_prefix =
|
|
104
|
+
trace_prefix = resolved_trace_name
|
|
77
105
|
|
|
78
106
|
if do_trace_tagging:
|
|
79
107
|
ts_send = message.get_timestamp("latency::ts_send")
|
|
@@ -199,3 +227,62 @@ def traceable_func(trace_name=None, dedupe=True):
|
|
|
199
227
|
return wrapper_inject_trace_info
|
|
200
228
|
|
|
201
229
|
return decorator_inject_trace_info
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def set_trace_timestamps_with_parent_context(control_message, execution_trace_log: dict, parent_name: str, logger=None):
    """
    Copy trace timestamps onto a control message, nesting child traces under a parent name.

    Keys of the form ``trace::<type>::<child>`` are rewritten to
    ``trace::<type>::<parent_name>::<child>`` unless ``<child>`` already starts
    with ``<parent_name>::``. All other keys are stored unchanged.

    Parameters
    ----------
    control_message : IngestControlMessage
        The message that receives the timestamps through ``set_timestamp``.
    execution_trace_log : dict
        Mapping of trace keys to timestamp values. Nothing happens when it is empty.
    parent_name : str
        The name inserted in front of each child trace name.
    logger : logging.Logger, optional
        When given, each rewritten key is reported at debug level.

    Examples
    --------
    >>> execution_trace_log = {"trace::entry::yolox_inference": ts1, "trace::exit::yolox_inference": ts2}
    >>> set_trace_timestamps_with_parent_context(
    ...     control_message, execution_trace_log, "pdf_extractor", logger
    ... )

    The stored keys are then:
    - trace::entry::pdf_extractor::yolox_inference
    - trace::exit::pdf_extractor::yolox_inference
    """
    if not execution_trace_log:
        return

    parent_prefix = f"{parent_name}::"

    for original_key, timestamp in execution_trace_log.items():
        target_key = original_key

        if original_key.startswith("trace::"):
            # At most three pieces: "trace", the trace type, and the remaining child name.
            segments = original_key.split("::", 2)
            if len(segments) == 3:
                _, trace_type, child_name = segments

                # Leave keys alone that already carry the parent context.
                if not child_name.startswith(parent_prefix):
                    target_key = f"trace::{trace_type}::{parent_name}::{child_name}"

                    if logger:
                        logger.debug(f"Enhanced trace key: {original_key} -> {target_key}")

        control_message.set_timestamp(target_key, timestamp)
|