nv-ingest-api 2025.4.21.dev20250421__py3-none-any.whl → 2025.4.22.dev20250422__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +215 -0
- nv_ingest_api/interface/extract.py +972 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +218 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +200 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +494 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
- nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
- nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
- nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +232 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +205 -0
- nv_ingest_api/internal/transform/embed_text.py +496 -0
- nv_ingest_api/internal/transform/split_text.py +157 -0
- nv_ingest_api/util/__init__.py +0 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +223 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +179 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
- nv_ingest_api/util/image_processing/transforms.py +407 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +31 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +451 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +469 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
- nv_ingest_api/util/nim/__init__.py +56 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +427 -0
- nv_ingest_api/util/schema/__init__.py +0 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +86 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +823 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +531 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/METADATA +1 -1
- nv_ingest_api-2025.4.22.dev20250422.dist-info/RECORD +152 -0
- nv_ingest_api-2025.4.21.dev20250421.dist-info/RECORD +0 -9
- /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
- {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/top_level.txt +0 -0
nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py
@@ -0,0 +1,16 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import logging
+
+from pydantic import ConfigDict, BaseModel
+from pydantic import StrictBool
+
+logger = logging.getLogger(__name__)
+
+
+class ImageDedupSchema(BaseModel):
+    raise_on_failure: StrictBool = False
+    model_config = ConfigDict(extra="forbid")
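For orientation, a minimal usage sketch (not part of the packaged diff), assuming the module import path above: StrictBool refuses type coercion, and extra="forbid" rejects unrecognized keys.

from pydantic import ValidationError

from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import ImageDedupSchema

ImageDedupSchema(raise_on_failure=True)  # plain bool: accepted

try:
    ImageDedupSchema(raise_on_failure="true")  # StrictBool rejects the string "true"
except ValidationError as exc:
    print(exc)

try:
    ImageDedupSchema(unknown_option=1)  # extra="forbid" rejects unknown fields
except ValidationError as exc:
    print(exc)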
nv_ingest_api/internal/schemas/store/store_embedding_schema.py
@@ -0,0 +1,28 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+
+from pydantic import ConfigDict, BaseModel
+
+logger = logging.getLogger(__name__)
+
+
+class EmbeddingStorageSchema(BaseModel):
+    raise_on_failure: bool = False
+    model_config = ConfigDict(extra="forbid")
nv_ingest_api/internal/schemas/store/store_image_schema.py
@@ -0,0 +1,30 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+
+from pydantic import ConfigDict, BaseModel
+
+logger = logging.getLogger(__name__)
+
+
+class ImageStorageModuleSchema(BaseModel):
+    structured: bool = True
+    images: bool = True
+    raise_on_failure: bool = False
+    model_config = ConfigDict(extra="forbid")
nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py
@@ -0,0 +1,15 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+from pydantic import ConfigDict, BaseModel
+
+
+class ImageCaptionExtractionSchema(BaseModel):
+    api_key: str = "api_key"
+    endpoint_url: str = "https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct/chat/completions"
+    prompt: str = "Caption the content of this image:"
+    model_name: str = "meta/llama-3.2-11b-vision-instruct"
+    raise_on_failure: bool = False
+    model_config = ConfigDict(extra="forbid")
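These fields map onto an OpenAI-style chat-completions request. A sketch of one plausible call (not from this package): the payload shape, the placeholder key, and the max_tokens value are assumptions; how the image itself is attached to the message is service-specific and not shown in this diff.

import requests

from nv_ingest_api.internal.schemas.transform.transform_image_caption_schema import (
    ImageCaptionExtractionSchema,
)

config = ImageCaptionExtractionSchema(api_key="nvapi-...")  # placeholder key
payload = {
    "model": config.model_name,
    "messages": [{"role": "user", "content": config.prompt}],  # image attachment omitted
    "max_tokens": 512,
}
headers = {"Authorization": f"Bearer {config.api_key}", "Accept": "application/json"}
response = requests.post(config.endpoint_url, headers=headers, json=payload, timeout=30)
print(response.json())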
nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py
@@ -0,0 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import logging
+
+from pydantic import ConfigDict, BaseModel
+from pydantic import StrictBool
+
+logger = logging.getLogger(__name__)
+
+
+class ImageFilterSchema(BaseModel):
+    raise_on_failure: StrictBool = False
+    cpu_only: StrictBool = False
+    model_config = ConfigDict(extra="forbid")
nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py
@@ -0,0 +1,25 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import logging
+
+from pydantic import ConfigDict, BaseModel
+
+from nv_ingest_api.util.logging.configuration import LogLevel
+
+logger = logging.getLogger(__name__)
+
+
+class TextEmbeddingSchema(BaseModel):
+    api_key: str = "api_key"
+    batch_size: int = 4
+    embedding_model: str = "nvidia/nv-embedqa-e5-v5"
+    embedding_nim_endpoint: str = "http://embedding:8000/v1"
+    encoding_format: str = "float"
+    httpx_log_level: LogLevel = LogLevel.WARNING
+    input_type: str = "passage"
+    raise_on_failure: bool = False
+    truncate: str = "END"
+    model_config = ConfigDict(extra="forbid")
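The api_key default above is a placeholder. A short sketch of overriding it and the endpoint at construction time; the environment variable name and the localhost endpoint are assumptions, not package defaults.

import os

from nv_ingest_api.internal.schemas.transform.transform_text_embedding_schema import (
    TextEmbeddingSchema,
)

config = TextEmbeddingSchema(
    api_key=os.environ.get("NVIDIA_API_KEY", "api_key"),  # hypothetical env var
    embedding_nim_endpoint="http://localhost:8000/v1",    # assumed self-hosted NIM
    batch_size=16,
)
print(config.model_dump())  # plain dict, e.g., for building a task payload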
nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py
@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from pydantic import Field, BaseModel, field_validator
+
+from typing import Optional
+
+from typing_extensions import Annotated
+
+
+class TextSplitterSchema(BaseModel):
+    tokenizer: Optional[str] = None
+    chunk_size: Annotated[int, Field(gt=0)] = 1024
+    chunk_overlap: Annotated[int, Field(ge=0)] = 150
+    raise_on_failure: bool = False
+
+    @field_validator("chunk_overlap")
+    def check_chunk_overlap(cls, v, values, **kwargs):
+        if v is not None and "chunk_size" in values.data and v >= values.data["chunk_size"]:
+            raise ValueError("chunk_overlap must be less than chunk_size")
+        return v
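A sketch of what the validator enforces (not part of the diff): chunk_size is declared before chunk_overlap, so values.data already holds it when the overlap is checked, and any overlap greater than or equal to the chunk size fails construction.

from pydantic import ValidationError

from nv_ingest_api.internal.schemas.transform.transform_text_splitter_schema import (
    TextSplitterSchema,
)

TextSplitterSchema(chunk_size=512, chunk_overlap=64)  # accepted: 64 < 512

try:
    TextSplitterSchema(chunk_size=128, chunk_overlap=128)  # overlap == size
except ValidationError as exc:
    print(exc)  # "chunk_overlap must be less than chunk_size"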
nv_ingest_api/internal/store/embed_text_upload.py
@@ -0,0 +1,236 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+import os
+from typing import Any, Union, Optional
+from typing import Dict
+
+import pandas as pd
+from minio import Minio
+from pymilvus import Collection
+from pymilvus import connections
+from pymilvus.bulk_writer.constants import BulkFileType
+from pymilvus.bulk_writer.remote_bulk_writer import RemoteBulkWriter
+from pydantic import BaseModel
+
+from nv_ingest_api.internal.enums.common import ContentTypeEnum
+from nv_ingest_api.internal.schemas.store.store_embedding_schema import EmbeddingStorageSchema
+
+logger = logging.getLogger(__name__)
+
+_DEFAULT_ENDPOINT = os.environ.get("MINIO_INTERNAL_ADDRESS", "minio:9000")
+_DEFAULT_BUCKET_NAME = os.environ.get("MINIO_BUCKET", "nv-ingest")
+
+
+def _upload_text_embeddings(df_store_ledger: pd.DataFrame, task_config: Dict[str, Any]) -> pd.DataFrame:
+    """
+    Uploads embeddings to MinIO for contents (e.g., images) contained in a DataFrame.
+    The image metadata in the "metadata" column is updated with the URL (or path) of the uploaded data.
+
+    This function performs the following steps:
+      1. Initializes a MinIO client using the provided task configuration parameters.
+      2. Connects to a Milvus instance and retrieves the collection schema.
+      3. Ensures that the target bucket exists (creating it if necessary).
+      4. Configures a RemoteBulkWriter to upload embedding data in PARQUET format.
+      5. Iterates over each row in the DataFrame, updates the metadata with the bucket path, and appends
+         rows to the writer if an embedding is present.
+      6. Commits the writer, finalizing the upload process.
+
+    Parameters
+    ----------
+    df_store_ledger : pd.DataFrame
+        DataFrame containing the data to upload. Each row is expected to have:
+        - A "metadata" column (a dictionary) that includes keys such as "content", "embedding",
+          "source_metadata", and "content_metadata".
+        - A "document_type" column indicating the type of document (e.g., IMAGE, STRUCTURED).
+    task_config : Dict[str, Any]
+        Dictionary of parameters for the upload. Expected keys include:
+        - "minio_access_key": Optional[str]
+            Access key for MinIO.
+        - "minio_secret_key": Optional[str]
+            Secret key for MinIO.
+        - "minio_endpoint": str, default _DEFAULT_ENDPOINT
+            MinIO endpoint URL.
+        - "minio_bucket_name": str, default _DEFAULT_BUCKET_NAME
+            Name of the bucket in MinIO.
+        - "minio_bucket_path": str, default "embeddings"
+            Path within the bucket where embeddings are stored.
+        - "minio_session_token": Optional[str]
+            (Optional) Session token for MinIO.
+        - "minio_secure": bool, default False
+            Whether to use a secure connection to MinIO.
+        - "minio_region": Optional[str]
+            (Optional) Region for the MinIO service.
+        - "milvus_address": str, default "milvus"
+            Address of the Milvus service.
+        - "milvus_uri": str, default "http://milvus:19530"
+            URI for Milvus.
+        - "milvus_host": str, default "milvus"
+            Host for Milvus.
+        - "milvus_port": int, default 19530
+            Port for Milvus.
+        - "collection_name": str, default "nv_ingest_collection"
+            Name of the Milvus collection from which to retrieve the schema.
+
+    Returns
+    -------
+    pd.DataFrame
+        The input DataFrame with updated "metadata" columns containing the uploaded embedding URL
+        (or bucket path).
+
+    Raises
+    ------
+    Exception
+        Propagates any exception encountered during the upload process, wrapping it with additional context.
+    """
+    try:
+        # Retrieve connection parameters for MinIO
+        minio_access_key: Optional[str] = task_config.get("minio_access_key")
+        minio_secret_key: Optional[str] = task_config.get("minio_secret_key")
+        minio_endpoint: str = task_config.get("minio_endpoint", _DEFAULT_ENDPOINT)
+        minio_bucket_name: str = task_config.get("minio_bucket_name", _DEFAULT_BUCKET_NAME)
+        minio_bucket_path: str = task_config.get("minio_bucket_path", "embeddings")
+
+        # Retrieve connection parameters for Milvus
+        milvus_address: str = task_config.get("milvus_address", "milvus")
+        milvus_uri: str = task_config.get("milvus_uri", "http://milvus:19530")
+        milvus_host: str = task_config.get("milvus_host", "milvus")
+        milvus_port: int = task_config.get("milvus_port", 19530)
+        milvus_collection_name: str = task_config.get("collection_name", "nv_ingest_collection")
+
+        # Initialize MinIO client
+        client = Minio(
+            minio_endpoint,
+            access_key=minio_access_key,
+            secret_key=minio_secret_key,
+            session_token=task_config.get("minio_session_token"),
+            secure=task_config.get("minio_secure", False),
+            region=task_config.get("minio_region"),
+        )
+
+        # Connect to Milvus and retrieve collection schema
+        connections.connect(
+            address=milvus_address,
+            uri=f"{milvus_uri}:{milvus_port}",
+            host=milvus_host,
+            port=milvus_port,
+        )
+        schema = Collection(milvus_collection_name).schema
+
+        # Ensure bucket exists
+        if not client.bucket_exists(minio_bucket_name):
+            client.make_bucket(minio_bucket_name)
+            logger.debug("Created bucket %s", minio_bucket_name)
+        else:
+            logger.debug("Bucket %s already exists", minio_bucket_name)
+
+        # Setup connection parameters for RemoteBulkWriter
+        conn = RemoteBulkWriter.ConnectParam(
+            endpoint=minio_endpoint,
+            access_key=minio_access_key,
+            secret_key=minio_secret_key,
+            bucket_name=minio_bucket_name,
+            secure=False,
+        )
+        writer = RemoteBulkWriter(
+            schema=schema,
+            remote_path=minio_bucket_path,
+            connect_param=conn,
+            file_type=BulkFileType.PARQUET,
+        )
+
+        # Process each row in the DataFrame
+        for idx, row in df_store_ledger.iterrows():
+            metadata: Dict[str, Any] = row["metadata"].copy()
+            # Update embedding metadata with the bucket path
+            metadata["embedding_metadata"] = {"uploaded_embedding_url": minio_bucket_path}
+
+            doc_type = row["document_type"]
+            content_replace: bool = doc_type in [ContentTypeEnum.IMAGE, ContentTypeEnum.STRUCTURED]
+            location: str = metadata["source_metadata"]["source_location"]
+            content = metadata["content"]
+
+            # If an embedding exists, update metadata and append the row for upload
+            if metadata.get("embedding") is not None:
+                logger.error(f"row type: {doc_type} - {location} - {len(content)}")
+                df_store_ledger.at[idx, "metadata"] = metadata
+
+                writer.append_row(
+                    {
+                        "text": location if content_replace else content,
+                        "source": metadata["source_metadata"],
+                        "content_metadata": metadata["content_metadata"],
+                        "vector": metadata["embedding"],
+                    }
+                )
+
+        writer.commit()
+        return df_store_ledger
+
+    except Exception as e:
+        err_msg = f"upload_embeddings: Error uploading embeddings. Original error: {e}"
+        logger.error(err_msg, exc_info=True)
+        raise type(e)(err_msg) from e
+
+
+def store_text_embeddings_internal(
+    df_store_ledger: pd.DataFrame,
+    task_config: Union[BaseModel, Dict[str, Any]],
+    store_config: EmbeddingStorageSchema,
+    execution_trace_log: Optional[Dict[str, Any]] = None,
+) -> pd.DataFrame:
+    """
+    Stores embeddings by uploading content from a DataFrame to MinIO.
+
+    This function prepares the necessary parameters for the upload based on the task configuration,
+    invokes the upload routine, and returns the updated DataFrame.
+
+    Parameters
+    ----------
+    df_store_ledger : pd.DataFrame
+        DataFrame containing the data whose embeddings need to be stored.
+    task_config : Union[BaseModel, Dict[str, Any]]
+        Task configuration. If it is a Pydantic model, it will be converted to a dictionary.
+    store_config : Dict[str, Any]
+        Configuration parameters for storage (not directly used in the current implementation).
+    execution_trace_log : Optional[Dict[str, Any]], default=None
+        Optional dictionary for trace logging information.
+
+    Returns
+    -------
+    pd.DataFrame
+        The updated DataFrame after embeddings have been uploaded and metadata updated.
+
+    Raises
+    ------
+    Exception
+        If any error occurs during the storage process, it is logged and re-raised with additional context.
+    """
+
+    _ = store_config  # Unused
+    _ = execution_trace_log  # Unused
+
+    try:
+        # Convert Pydantic model to dict if necessary
+        if isinstance(task_config, BaseModel):
+            task_config = task_config.model_dump()
+
+        # Set content types for embeddings and update params
+        content_types = {ContentTypeEnum.EMBEDDING: True}
+        params: Dict[str, Any] = task_config.get("params", {})
+        params["content_types"] = content_types
+
+        # Perform the upload of embeddings
+        df_store_ledger = _upload_text_embeddings(df_store_ledger, params)
+
+        result, execution_trace_log = df_store_ledger, {}
+        _ = execution_trace_log  # Unused
+
+        return result
+
+    except Exception as e:
+        err_msg = f"_store_embeddings: Failed to store embeddings: {e}"
+        logger.error(err_msg, exc_info=True)
+        raise type(e)(err_msg) from e
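A sketch of the minimal ledger row this path consumes (not from the package), assuming MinIO and Milvus are reachable at the module defaults and the target collection already exists; ContentTypeEnum.TEXT, the vector dimension, and the minioadmin credentials are assumptions.

import pandas as pd

from nv_ingest_api.internal.enums.common import ContentTypeEnum
from nv_ingest_api.internal.schemas.store.store_embedding_schema import EmbeddingStorageSchema
from nv_ingest_api.internal.store.embed_text_upload import store_text_embeddings_internal

# One row shaped the way _upload_text_embeddings expects: "content", "embedding",
# "source_metadata" (with "source_location"), and "content_metadata".
ledger = pd.DataFrame(
    [
        {
            "document_type": ContentTypeEnum.TEXT,  # assumed enum member
            "metadata": {
                "content": "some extracted text",
                "embedding": [0.1] * 1024,  # dimension must match the collection schema
                "source_metadata": {"source_location": ""},
                "content_metadata": {"type": "text"},
            },
        }
    ]
)

# MinIO/Milvus settings travel under "params"; the credentials are placeholders.
updated = store_text_embeddings_internal(
    df_store_ledger=ledger,
    task_config={"params": {"minio_access_key": "minioadmin", "minio_secret_key": "minioadmin"}},
    store_config=EmbeddingStorageSchema(),
)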
nv_ingest_api/internal/store/image_upload.py
@@ -0,0 +1,232 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import base64
+import logging
+import os
+from io import BytesIO
+from typing import Any, List, Optional
+from typing import Dict
+from urllib.parse import quote
+
+import pandas as pd
+from minio import Minio
+
+from nv_ingest_api.internal.enums.common import ContentTypeEnum
+
+logger = logging.getLogger(__name__)
+
+# TODO: Move these into microservice_entrypoint.py to populate the stage and validate them using the pydantic schema
+# on startup.
+_DEFAULT_ENDPOINT = os.environ.get("MINIO_INTERNAL_ADDRESS", "minio:9000")
+_DEFAULT_READ_ADDRESS = os.environ.get("MINIO_PUBLIC_ADDRESS", "http://minio:9000")
+_DEFAULT_BUCKET_NAME = os.environ.get("MINIO_BUCKET", "nv-ingest")
+
+
+def _ensure_bucket_exists(client: Minio, bucket_name: str) -> None:
+    """
+    Ensure that the specified bucket exists in MinIO, and create it if it does not.
+
+    Parameters
+    ----------
+    client : Minio
+        An instance of the Minio client.
+    bucket_name : str
+        The name of the bucket to check or create.
+    """
+    if not client.bucket_exists(bucket_name):
+        client.make_bucket(bucket_name)
+        logger.debug("Created bucket %s", bucket_name)
+    else:
+        logger.debug("Bucket %s already exists", bucket_name)
+
+
+def _upload_images_to_minio(df: pd.DataFrame, params: Dict[str, Any]) -> pd.DataFrame:
+    """
+    Identifies content within a DataFrame and uploads it to MinIO, updating the metadata with the uploaded URL.
+
+    This function iterates over rows of the provided DataFrame. For rows whose "document_type" is listed
+    in the provided 'content_types' configuration, it decodes the base64-encoded content, uploads the object to
+    MinIO, and updates the metadata with the public URL. Errors during individual row processing are logged and
+    skipped, so the process continues for remaining rows.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The DataFrame containing rows with content and associated metadata.
+    params : Dict[str, Any]
+        A flat dictionary of configuration parameters for the upload. Expected keys include:
+        - "content_types": Dict mapping document types to booleans.
+        - "endpoint": URL for the MinIO service (optional; defaults to _DEFAULT_ENDPOINT).
+        - "bucket_name": Bucket name for storing objects (optional; defaults to _DEFAULT_BUCKET_NAME).
+        - "access_key": Access key for MinIO.
+        - "secret_key": Secret key for MinIO.
+        - "session_token": Session token for MinIO (optional).
+        - "secure": Boolean indicating if HTTPS should be used.
+        - "region": Region for the MinIO service (optional).
+
+    Returns
+    -------
+    pd.DataFrame
+        The updated DataFrame with metadata reflecting the uploaded URLs. Rows that encountered errors
+        during processing will remain unchanged.
+
+    Raises
+    ------
+    ValueError
+        If the required "content_types" key is missing or is not a dictionary.
+    Exception
+        Propagates any critical exceptions not handled at the row level.
+    """
+    # Validate required configuration
+    content_types = params.get("content_types")
+    if not isinstance(content_types, dict):
+        raise ValueError("Invalid configuration: 'content_types' must be provided as a dictionary in params")
+
+    endpoint: str = params.get("endpoint", _DEFAULT_ENDPOINT)
+    bucket_name: str = params.get("bucket_name", _DEFAULT_BUCKET_NAME)
+
+    # Initialize MinIO client
+    client = Minio(
+        endpoint,
+        access_key=params.get("access_key"),
+        secret_key=params.get("secret_key"),
+        session_token=params.get("session_token"),
+        secure=params.get("secure", False),
+        region=params.get("region"),
+    )
+
+    # Ensure the bucket exists
+    _ensure_bucket_exists(client, bucket_name)
+
+    # Process each row and attempt to upload images
+    for idx, row in df.iterrows():
+        try:
+            doc_type = row.get("document_type")
+            if doc_type not in content_types:
+                continue
+
+            metadata = row.get("metadata")
+            if not isinstance(metadata, dict):
+                logger.error("Row %s: 'metadata' is not a dictionary", idx)
+                continue
+
+            # Validate required metadata fields
+            if "content" not in metadata:
+                logger.error("Row %s: missing 'content' in metadata", idx)
+                continue
+            if "source_metadata" not in metadata or not isinstance(metadata["source_metadata"], dict):
+                logger.error("Row %s: missing or invalid 'source_metadata' in metadata", idx)
+                continue
+
+            source_metadata = metadata["source_metadata"]
+            if "source_id" not in source_metadata:
+                logger.error("Row %s: missing 'source_id' in source_metadata", idx)
+                continue
+
+            # Decode the content from base64
+            content = base64.b64decode(metadata["content"].encode())
+            source_id = source_metadata["source_id"]
+
+            # Determine image type (default to 'png')
+            image_type = "png"
+            if doc_type == ContentTypeEnum.IMAGE:
+                image_metadata = metadata.get("image_metadata", {})
+                image_type = image_metadata.get("image_type", "png")
+
+            # Construct destination file path
+            encoded_source_id = quote(source_id, safe="")
+            encoded_image_type = quote(image_type, safe="")
+            destination_file = f"{encoded_source_id}/{idx}.{encoded_image_type}"
+
+            # Upload the object to MinIO
+            source_file = BytesIO(content)
+            client.put_object(
+                bucket_name,
+                destination_file,
+                source_file,
+                length=len(content),
+            )
+
+            # Construct the public URL
+            public_url = f"{_DEFAULT_READ_ADDRESS}/{bucket_name}/{destination_file}"
+            source_metadata["source_location"] = public_url
+
+            if doc_type == ContentTypeEnum.IMAGE:
+                logger.debug("Storing image data to Minio for row %s", idx)
+                image_metadata = metadata.get("image_metadata", {})
+                image_metadata["uploaded_image_url"] = public_url
+                metadata["image_metadata"] = image_metadata
+            elif doc_type == ContentTypeEnum.STRUCTURED:
+                logger.debug("Storing structured image data to Minio for row %s", idx)
+                table_metadata = metadata.get("table_metadata", {})
+                table_metadata["uploaded_image_url"] = public_url
+                metadata["table_metadata"] = table_metadata
+
+            df.at[idx, "metadata"] = metadata
+
+        except Exception as e:
+            logger.exception("Failed to process row %s: %s", idx, e)
+            # Continue processing the remaining rows
+
+    return df
+
+
+def store_images_to_minio_internal(
+    df_storage_ledger: pd.DataFrame,
+    task_config: Dict[str, Any],
+    storage_config: Dict[str, Any],
+    execution_trace_log: Optional[List[Any]] = None,
+) -> pd.DataFrame:
+    """
+    Processes a storage ledger DataFrame to upload images (and structured content) to MinIO.
+
+    This function validates the input DataFrame and task configuration, then creates a mask to select rows
+    where the "document_type" is among the desired types specified in the configuration. If matching rows are
+    found, it calls the internal upload function to process and update the DataFrame; otherwise, it returns the
+    original DataFrame unmodified.
+
+    Parameters
+    ----------
+    df_storage_ledger : pd.DataFrame
+        The DataFrame containing storage ledger information, which must include at least the columns
+        "document_type" and "metadata".
+    task_config : Dict[str, Any]
+        A flat dictionary containing configuration parameters for image storage. Expected to include the key
+        "content_types" (a dict mapping document types to booleans) along with connection and credential details.
+    storage_config : Dict[str, Any]
+        A dictionary reserved for additional storage configuration (currently unused).
+    execution_trace_log : Optional[List[Any]], optional
+        An optional list for capturing execution trace details (currently unused), by default None.
+
+    Returns
+    -------
+    pd.DataFrame
+        The updated DataFrame after attempting to upload images for rows with matching document types. Rows
+        that do not match remain unchanged.
+
+    Raises
+    ------
+    ValueError
+        If the input DataFrame is missing required columns or if the task configuration is invalid.
+    """
+    # Validate that required keys and columns exist
+    if "content_types" not in task_config or not isinstance(task_config["content_types"], dict):
+        raise ValueError("Task configuration must include a valid 'content_types' dictionary.")
+
+    if "document_type" not in df_storage_ledger.columns:
+        raise ValueError("Input DataFrame must contain a 'document_type' column.")
+
+    content_types = task_config["content_types"]
+
+    # Create a mask for rows where "document_type" is one of the desired types
+    storage_obj_mask = df_storage_ledger["document_type"].isin(list(content_types.keys()))
+    if (~storage_obj_mask).all():
+        logger.debug("No storage objects matching %s found in the DataFrame.", content_types)
+        return df_storage_ledger
+
+    result, execution_trace_log = _upload_images_to_minio(df_storage_ledger, task_config), {}
+    _ = execution_trace_log
+
+    return result
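A sketch of driving the public entry point with a single image row (not from the package), assuming MinIO is reachable at the default endpoint; the credentials are placeholders, and the object bytes are arbitrary because this path stores them as-is without validating image data.

import base64

import pandas as pd

from nv_ingest_api.internal.enums.common import ContentTypeEnum
from nv_ingest_api.internal.store.image_upload import store_images_to_minio_internal

content_b64 = base64.b64encode(b"fake-image-bytes").decode()  # stored verbatim by MinIO

ledger = pd.DataFrame(
    [
        {
            "document_type": ContentTypeEnum.IMAGE,
            "metadata": {
                "content": content_b64,
                "source_metadata": {"source_id": "doc-1"},
                "image_metadata": {"image_type": "png"},
            },
        }
    ]
)

task_config = {
    "content_types": {ContentTypeEnum.IMAGE: True},
    "access_key": "minioadmin",  # placeholder credentials
    "secret_key": "minioadmin",
}
updated = store_images_to_minio_internal(ledger, task_config, storage_config={})
print(updated.at[0, "metadata"]["image_metadata"]["uploaded_image_url"])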