nv-ingest-api 2025.4.18.dev20250418__tar.gz → 2025.4.19.dev20250419__tar.gz
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. (The original page linked to more details here; the link was not preserved in this text.)
- {nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api.egg-info → nv_ingest_api-2025.4.19.dev20250419}/PKG-INFO +1 -1
- {nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal → nv_ingest_api-2025.4.19.dev20250419/src/nv_ingest_api}/primitives/control_message_task.py +0 -4
- {nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal → nv_ingest_api-2025.4.19.dev20250419/src/nv_ingest_api}/primitives/ingest_control_message.py +2 -5
- {nv_ingest_api-2025.4.18.dev20250418 → nv_ingest_api-2025.4.19.dev20250419/src/nv_ingest_api.egg-info}/PKG-INFO +1 -1
- nv_ingest_api-2025.4.19.dev20250419/src/nv_ingest_api.egg-info/SOURCES.txt +14 -0
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/__init__.py +0 -3
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/interface/__init__.py +0 -215
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/interface/extract.py +0 -972
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/interface/mutate.py +0 -154
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/interface/store.py +0 -218
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/interface/transform.py +0 -382
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/interface/utility.py +0 -200
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/enums/__init__.py +0 -3
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/enums/common.py +0 -494
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/__init__.py +0 -3
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/audio/__init__.py +0 -3
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -149
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/docx/__init__.py +0 -5
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -205
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -3
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -122
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -895
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/image/__init__.py +0 -3
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/image/chart_extractor.py +0 -353
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/image/image_extractor.py +0 -204
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -3
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -403
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -253
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/image/table_extractor.py +0 -344
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/pdf/__init__.py +0 -3
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -19
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -484
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -243
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -597
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -146
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -603
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -96
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -426
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -74
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/pptx/__init__.py +0 -5
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -799
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -187
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/mutate/__init__.py +0 -3
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/mutate/deduplicate.py +0 -110
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/mutate/filter.py +0 -133
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/primitives/nim/__init__.py +0 -8
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/primitives/nim/default_values.py +0 -15
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -3
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -274
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -56
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -270
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -275
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -238
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -462
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -367
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -132
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -152
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -1400
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/primitives/nim/nim_client.py +0 -344
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -81
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/primitives/tracing/latency.py +0 -69
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/primitives/tracing/logging.py +0 -96
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/primitives/tracing/tagging.py +0 -197
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/__init__.py +0 -3
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/extract/__init__.py +0 -3
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -130
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -135
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -124
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -124
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -128
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -218
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -124
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -129
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -3
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -23
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -34
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -19
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/meta/__init__.py +0 -3
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -11
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -237
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -221
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/mutate/__init__.py +0 -3
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -16
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/store/__init__.py +0 -3
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -28
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -30
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/transform/__init__.py +0 -3
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -15
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -17
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -25
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -22
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/store/__init__.py +0 -3
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/store/embed_text_upload.py +0 -236
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/store/image_upload.py +0 -232
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/transform/__init__.py +0 -3
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/transform/caption_image.py +0 -205
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/transform/embed_text.py +0 -496
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/transform/split_text.py +0 -157
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/__init__.py +0 -0
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/control_message/validators.py +0 -47
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/converters/bytetools.py +0 -78
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/converters/containers.py +0 -65
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/converters/datetools.py +0 -90
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/converters/dftools.py +0 -127
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/converters/formats.py +0 -64
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/converters/type_mappings.py +0 -27
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/detectors/__init__.py +0 -5
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/detectors/language.py +0 -38
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/exception_handlers/converters.py +0 -72
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/exception_handlers/decorators.py +0 -223
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/exception_handlers/detectors.py +0 -74
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/exception_handlers/pdf.py +0 -116
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/exception_handlers/schemas.py +0 -68
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/image_processing/__init__.py +0 -5
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/image_processing/clustering.py +0 -260
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/image_processing/processing.py +0 -179
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/image_processing/table_and_chart.py +0 -449
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/image_processing/transforms.py +0 -407
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/logging/configuration.py +0 -31
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/message_brokers/__init__.py +0 -3
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -9
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -465
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -71
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -451
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/metadata/__init__.py +0 -5
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/metadata/aggregators.py +0 -469
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/multi_processing/__init__.py +0 -8
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -194
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/nim/__init__.py +0 -56
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/pdf/__init__.py +0 -3
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/pdf/pdfium.py +0 -427
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/schema/__init__.py +0 -0
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/schema/schema_validator.py +0 -10
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/service_clients/__init__.py +0 -3
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/service_clients/client_base.py +0 -86
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/service_clients/kafka/__init__.py +0 -3
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/service_clients/redis/redis_client.py +0 -823
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/service_clients/rest/rest_client.py +0 -531
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/util/string_processing/__init__.py +0 -51
- nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api.egg-info/SOURCES.txt +0 -157
- {nv_ingest_api-2025.4.18.dev20250418 → nv_ingest_api-2025.4.19.dev20250419}/LICENSE +0 -0
- {nv_ingest_api-2025.4.18.dev20250418 → nv_ingest_api-2025.4.19.dev20250419}/MANIFEST.in +0 -0
- {nv_ingest_api-2025.4.18.dev20250418 → nv_ingest_api-2025.4.19.dev20250419}/README.md +0 -0
- {nv_ingest_api-2025.4.18.dev20250418 → nv_ingest_api-2025.4.19.dev20250419}/pyproject.toml +0 -0
- {nv_ingest_api-2025.4.18.dev20250418 → nv_ingest_api-2025.4.19.dev20250419}/setup.cfg +0 -0
- {nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal → nv_ingest_api-2025.4.19.dev20250419/src/nv_ingest_api}/__init__.py +0 -0
- {nv_ingest_api-2025.4.18.dev20250418/src/nv_ingest_api/internal/extract/docx/engines → nv_ingest_api-2025.4.19.dev20250419/src/nv_ingest_api/primitives}/__init__.py +0 -0
- {nv_ingest_api-2025.4.18.dev20250418 → nv_ingest_api-2025.4.19.dev20250419}/src/nv_ingest_api.egg-info/dependency_links.txt +0 -0
- {nv_ingest_api-2025.4.18.dev20250418 → nv_ingest_api-2025.4.19.dev20250419}/src/nv_ingest_api.egg-info/requires.txt +0 -0
- {nv_ingest_api-2025.4.18.dev20250418 → nv_ingest_api-2025.4.19.dev20250419}/src/nv_ingest_api.egg-info/top_level.txt +0 -0
- {nv_ingest_api-2025.4.18.dev20250418 → nv_ingest_api-2025.4.19.dev20250419}/src/version.py +0 -0
|
@@ -1,7 +1,3 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
-
# All rights reserved.
|
|
3
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
|
|
5
1
|
import copy
|
|
6
2
|
import re
|
|
7
3
|
from datetime import datetime
|
|
@@ -10,7 +6,8 @@ import logging
|
|
|
10
6
|
import pandas as pd
|
|
11
7
|
from typing import Any, Dict, Generator, Union
|
|
12
8
|
|
|
13
|
-
from nv_ingest_api.internal.primitives.control_message_task import ControlMessageTask
|
|
9
|
+
from nv_ingest_api.primitives.control_message_task import ControlMessageTask
|
|
10
|
+
|
|
14
11
|
|
|
15
12
|
logger = logging.getLogger(__name__)
|
|
16
13
|
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
MANIFEST.in
|
|
3
|
+
README.md
|
|
4
|
+
pyproject.toml
|
|
5
|
+
src/version.py
|
|
6
|
+
src/nv_ingest_api/__init__.py
|
|
7
|
+
src/nv_ingest_api.egg-info/PKG-INFO
|
|
8
|
+
src/nv_ingest_api.egg-info/SOURCES.txt
|
|
9
|
+
src/nv_ingest_api.egg-info/dependency_links.txt
|
|
10
|
+
src/nv_ingest_api.egg-info/requires.txt
|
|
11
|
+
src/nv_ingest_api.egg-info/top_level.txt
|
|
12
|
+
src/nv_ingest_api/primitives/__init__.py
|
|
13
|
+
src/nv_ingest_api/primitives/control_message_task.py
|
|
14
|
+
src/nv_ingest_api/primitives/ingest_control_message.py
|
|
@@ -1,215 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
-
# All rights reserved.
|
|
3
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
|
|
5
|
-
import logging
|
|
6
|
-
import functools
|
|
7
|
-
import inspect
|
|
8
|
-
import pprint
|
|
9
|
-
from typing import Dict, Any, Optional, List
|
|
10
|
-
|
|
11
|
-
from pydantic import BaseModel
|
|
12
|
-
|
|
13
|
-
from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFiumConfigSchema, NemoRetrieverParseConfigSchema
|
|
14
|
-
|
|
15
|
-
logger = logging.getLogger(__name__)
|
|
16
|
-
|
|
17
|
-
## CONFIG_SCHEMAS is a global dictionary that maps extraction methods to Pydantic schemas.
|
|
18
|
-
CONFIG_SCHEMAS: Dict[str, Any] = {
|
|
19
|
-
"adobe": PDFiumConfigSchema,
|
|
20
|
-
"llama": PDFiumConfigSchema,
|
|
21
|
-
"nemoretriever_parse": NemoRetrieverParseConfigSchema,
|
|
22
|
-
"pdfium": PDFiumConfigSchema,
|
|
23
|
-
"tika": PDFiumConfigSchema,
|
|
24
|
-
"unstructured_io": PDFiumConfigSchema,
|
|
25
|
-
}
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
def _build_config_from_schema(schema_class: type[BaseModel], args: Dict[str, Any]) -> Dict[str, Any]:
|
|
29
|
-
"""
|
|
30
|
-
Build and validate a configuration dictionary from the provided arguments using a Pydantic schema.
|
|
31
|
-
|
|
32
|
-
This function filters the supplied arguments to include only those keys defined in the given
|
|
33
|
-
Pydantic schema (using Pydantic v2's `model_fields`), instantiates the schema for validation,
|
|
34
|
-
and returns the validated configuration as a dictionary.
|
|
35
|
-
|
|
36
|
-
Parameters
|
|
37
|
-
----------
|
|
38
|
-
schema_class : type[BaseModel]
|
|
39
|
-
The Pydantic BaseModel subclass used for validating the configuration.
|
|
40
|
-
args : dict
|
|
41
|
-
A dictionary of arguments from which to extract and validate configuration data.
|
|
42
|
-
|
|
43
|
-
Returns
|
|
44
|
-
-------
|
|
45
|
-
dict
|
|
46
|
-
A dictionary containing the validated configuration data as defined by the schema.
|
|
47
|
-
|
|
48
|
-
Raises
|
|
49
|
-
------
|
|
50
|
-
pydantic.ValidationError
|
|
51
|
-
If the provided arguments do not conform to the schema.
|
|
52
|
-
"""
|
|
53
|
-
field_names = schema_class.model_fields.keys()
|
|
54
|
-
config_data = {k: v for k, v in args.items() if k in field_names}
|
|
55
|
-
# Instantiate the schema to perform validation, then return the model's dictionary representation.
|
|
56
|
-
|
|
57
|
-
return schema_class(**config_data).dict()
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
def extraction_interface_relay_constructor(api_fn, task_keys: Optional[List[str]] = None):
|
|
61
|
-
"""
|
|
62
|
-
Decorator for constructing and validating configuration using Pydantic schemas.
|
|
63
|
-
|
|
64
|
-
This decorator wraps a user-facing interface function. It extracts common task parameters
|
|
65
|
-
(using the provided task_keys, or defaults if not specified) and method-specific configuration
|
|
66
|
-
parameters based on a required 'extract_method' keyword argument. It then uses the corresponding
|
|
67
|
-
Pydantic schema (from the global CONFIG_SCHEMAS registry) to validate and build a method-specific
|
|
68
|
-
configuration. The resulting composite configuration, along with the extraction ledger and
|
|
69
|
-
execution trace log, is passed to the backend API function.
|
|
70
|
-
|
|
71
|
-
Parameters
|
|
72
|
-
----------
|
|
73
|
-
api_fn : callable
|
|
74
|
-
The backend API function that will be called with the extraction ledger, the task configuration
|
|
75
|
-
dictionary, the extractor configuration, and the execution trace log. This function must conform
|
|
76
|
-
to the signature:
|
|
77
|
-
|
|
78
|
-
extract_primitives_from_pdf_internal(df_extraction_ledger: pd.DataFrame,
|
|
79
|
-
task_config: Dict[str, Any],
|
|
80
|
-
extractor_config: Any,
|
|
81
|
-
execution_trace_log: Optional[List[Any]] = None)
|
|
82
|
-
task_keys : list of str, optional
|
|
83
|
-
A list of keyword names that should be extracted from the user function as common task parameters.
|
|
84
|
-
If not provided, defaults to ["extract_text", "extract_images", "extract_tables", "extract_charts"].
|
|
85
|
-
|
|
86
|
-
Returns
|
|
87
|
-
-------
|
|
88
|
-
callable
|
|
89
|
-
A wrapped function that builds and validates the configuration before invoking the backend API function.
|
|
90
|
-
|
|
91
|
-
Raises
|
|
92
|
-
------
|
|
93
|
-
ValueError
|
|
94
|
-
If the extraction method specified is not supported (i.e., no corresponding Pydantic schema exists
|
|
95
|
-
in CONFIG_SCHEMAS), if api_fn does not conform to the expected signature, or if the required
|
|
96
|
-
'extract_method' parameter is not provided.
|
|
97
|
-
"""
|
|
98
|
-
# Verify that api_fn conforms to the expected signature.
|
|
99
|
-
try:
|
|
100
|
-
# Try binding four arguments: ledger, task_config, extractor_config, and execution_trace_log.
|
|
101
|
-
inspect.signature(api_fn).bind("dummy_ledger", {"dummy": True}, {"dummy": True}, {})
|
|
102
|
-
except TypeError as e:
|
|
103
|
-
raise ValueError(
|
|
104
|
-
"api_fn must conform to the signature: "
|
|
105
|
-
"extract_primitives_from_pdf(df_extraction_ledger, task_config, extractor_config, execution_trace_log)"
|
|
106
|
-
) from e
|
|
107
|
-
|
|
108
|
-
if task_keys is None:
|
|
109
|
-
task_keys = []
|
|
110
|
-
|
|
111
|
-
def decorator(user_fn):
|
|
112
|
-
@functools.wraps(user_fn)
|
|
113
|
-
def wrapper(*args, **kwargs):
|
|
114
|
-
# Use bind_partial so that missing required arguments can be handled gracefully.
|
|
115
|
-
sig = inspect.signature(user_fn)
|
|
116
|
-
bound = sig.bind_partial(*args, **kwargs)
|
|
117
|
-
bound.apply_defaults()
|
|
118
|
-
|
|
119
|
-
# The first parameter is assumed to be the extraction ledger.
|
|
120
|
-
param_names = list(sig.parameters.keys())
|
|
121
|
-
if param_names[0] not in bound.arguments:
|
|
122
|
-
raise ValueError("Missing required ledger argument.")
|
|
123
|
-
ledger = bound.arguments[param_names[0]]
|
|
124
|
-
|
|
125
|
-
# Process reserved 'execution_trace_log'.
|
|
126
|
-
execution_trace_log = bound.arguments.get("execution_trace_log", None)
|
|
127
|
-
if execution_trace_log is None:
|
|
128
|
-
execution_trace_log = {} # Replace None with an empty dict.
|
|
129
|
-
if "execution_trace_log" in bound.arguments:
|
|
130
|
-
del bound.arguments["execution_trace_log"]
|
|
131
|
-
|
|
132
|
-
# Ensure that 'extract_method' is provided.
|
|
133
|
-
if "extract_method" not in bound.arguments or bound.arguments["extract_method"] is None:
|
|
134
|
-
raise ValueError("The 'extract_method' parameter is required.")
|
|
135
|
-
extract_method = bound.arguments["extract_method"]
|
|
136
|
-
del bound.arguments["extract_method"]
|
|
137
|
-
|
|
138
|
-
# Extract common task parameters using the specified task_keys.
|
|
139
|
-
task_params = {key: bound.arguments[key] for key in task_keys if key in bound.arguments}
|
|
140
|
-
task_params["extract_method"] = extract_method
|
|
141
|
-
task_config = {"params": task_params}
|
|
142
|
-
|
|
143
|
-
# Look up the appropriate Pydantic schema.
|
|
144
|
-
schema_class = CONFIG_SCHEMAS.get(extract_method)
|
|
145
|
-
if schema_class is None:
|
|
146
|
-
raise ValueError(f"Unsupported extraction method: {extract_method}")
|
|
147
|
-
|
|
148
|
-
# Build the method-specific configuration using the schema class.
|
|
149
|
-
extraction_config_dict = _build_config_from_schema(schema_class, bound.arguments)
|
|
150
|
-
|
|
151
|
-
# Create a Pydantic object instead of a dictionary for the specific extractor config
|
|
152
|
-
extractor_schema = None
|
|
153
|
-
try:
|
|
154
|
-
# Find the appropriate extractor schema class based on the extraction method
|
|
155
|
-
extractor_schema_name = f"{extract_method.capitalize()}ExtractorSchema"
|
|
156
|
-
extractor_schema_class = globals().get(extractor_schema_name)
|
|
157
|
-
|
|
158
|
-
if extractor_schema_class is None:
|
|
159
|
-
# Try another common naming pattern
|
|
160
|
-
extractor_schema_name = f"{extract_method.upper()}ExtractorSchema"
|
|
161
|
-
extractor_schema_class = globals().get(extractor_schema_name)
|
|
162
|
-
|
|
163
|
-
if extractor_schema_class is None:
|
|
164
|
-
# Final fallback attempt with camelCase
|
|
165
|
-
extractor_schema_name = f"{extract_method[0].upper() + extract_method[1:]}ExtractorSchema"
|
|
166
|
-
extractor_schema_class = globals().get(extractor_schema_name)
|
|
167
|
-
|
|
168
|
-
if extractor_schema_class is not None:
|
|
169
|
-
# Create the extractor schema with the method-specific config
|
|
170
|
-
config_key = f"{extract_method}_config"
|
|
171
|
-
extractor_schema = extractor_schema_class(**{config_key: extraction_config_dict})
|
|
172
|
-
else:
|
|
173
|
-
logger.warning(f"Could not find extractor schema class for method: {extract_method}")
|
|
174
|
-
except Exception as e:
|
|
175
|
-
logger.warning(f"Error creating extractor schema: {str(e)}")
|
|
176
|
-
# Fall back to dictionary approach if schema creation fails
|
|
177
|
-
extractor_schema = {f"{extract_method}_config": extraction_config_dict}
|
|
178
|
-
|
|
179
|
-
# If schema creation failed, fall back to dictionary
|
|
180
|
-
if extractor_schema is None:
|
|
181
|
-
extractor_schema = {f"{extract_method}_config": extraction_config_dict}
|
|
182
|
-
|
|
183
|
-
# Log the task and extractor configurations for debugging
|
|
184
|
-
logger.debug("\n" + "=" * 80)
|
|
185
|
-
logger.debug(f"DEBUG - API Function: {api_fn.__name__}")
|
|
186
|
-
logger.debug(f"DEBUG - Extract Method: {extract_method}")
|
|
187
|
-
logger.debug("-" * 80)
|
|
188
|
-
|
|
189
|
-
# Format the task config as a string and log it
|
|
190
|
-
task_config_str = pprint.pformat(task_config, width=100, sort_dicts=False)
|
|
191
|
-
logger.debug(f"DEBUG - Task Config:\n{task_config_str}")
|
|
192
|
-
logger.debug("-" * 80)
|
|
193
|
-
|
|
194
|
-
# Format the extractor config as a string and log it
|
|
195
|
-
if hasattr(extractor_schema, "model_dump"):
|
|
196
|
-
extractor_config_str = pprint.pformat(extractor_schema.model_dump(), width=100, sort_dicts=False)
|
|
197
|
-
else:
|
|
198
|
-
extractor_config_str = pprint.pformat(extractor_schema, width=100, sort_dicts=False)
|
|
199
|
-
logger.debug(f"DEBUG - Extractor Config Type: {type(extractor_schema)}")
|
|
200
|
-
logger.debug(f"DEBUG - Extractor Config:\n{extractor_config_str}")
|
|
201
|
-
logger.debug("=" * 80 + "\n")
|
|
202
|
-
|
|
203
|
-
# Call the backend API function.
|
|
204
|
-
pprint.pprint(task_config)
|
|
205
|
-
pprint.pprint(extractor_schema)
|
|
206
|
-
result = api_fn(ledger, task_config, extractor_schema, execution_trace_log)
|
|
207
|
-
|
|
208
|
-
# If the result is a tuple, return only the first element
|
|
209
|
-
if isinstance(result, tuple):
|
|
210
|
-
return result[0]
|
|
211
|
-
return result
|
|
212
|
-
|
|
213
|
-
return wrapper
|
|
214
|
-
|
|
215
|
-
return decorator
|