nv-ingest-api 26.1.0rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +218 -0
- nv_ingest_api/interface/extract.py +977 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +200 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +186 -0
- nv_ingest_api/internal/__init__.py +0 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +550 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
- nv_ingest_api/internal/extract/html/__init__.py +3 -0
- nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
- nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
- nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
- nv_ingest_api/internal/meta/__init__.py +3 -0
- nv_ingest_api/internal/meta/udf.py +232 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/internal/primitives/control_message_task.py +16 -0
- nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
- nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
- nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
- nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
- nv_ingest_api/internal/schemas/meta/udf.py +23 -0
- nv_ingest_api/internal/schemas/mixins.py +39 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +251 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +219 -0
- nv_ingest_api/internal/transform/embed_text.py +702 -0
- nv_ingest_api/internal/transform/split_text.py +182 -0
- nv_ingest_api/util/__init__.py +3 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/dataloader/__init__.py +9 -0
- nv_ingest_api/util/dataloader/dataloader.py +409 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +429 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +177 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
- nv_ingest_api/util/image_processing/transforms.py +850 -0
- nv_ingest_api/util/imports/__init__.py +3 -0
- nv_ingest_api/util/imports/callable_signatures.py +108 -0
- nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
- nv_ingest_api/util/introspection/__init__.py +3 -0
- nv_ingest_api/util/introspection/class_inspect.py +145 -0
- nv_ingest_api/util/introspection/function_inspect.py +65 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +102 -0
- nv_ingest_api/util/logging/sanitize.py +84 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +516 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
- nv_ingest_api/util/nim/__init__.py +161 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +428 -0
- nv_ingest_api/util/schema/__init__.py +3 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +86 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- nv_ingest_api/util/string_processing/configuration.py +682 -0
- nv_ingest_api/util/string_processing/yaml.py +109 -0
- nv_ingest_api/util/system/__init__.py +0 -0
- nv_ingest_api/util/system/hardware_info.py +594 -0
- nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
- nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
- nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
- nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
- nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
- udfs/__init__.py +5 -0
- udfs/llm_summarizer_udf.py +259 -0
nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py
@@ -0,0 +1,24 @@
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from pydantic import Field, BaseModel, field_validator, ConfigDict

from typing import Optional


class TextSplitterSchema(BaseModel):
    tokenizer: Optional[str] = None
    chunk_size: int = Field(default=1024, gt=0)
    chunk_overlap: int = Field(default=150, ge=0)
    raise_on_failure: bool = False

    @field_validator("chunk_overlap")
    @classmethod
    def check_chunk_overlap(cls, v, values):
        chunk_size = values.data.get("chunk_size")
        if chunk_size is not None and v >= chunk_size:
            raise ValueError("chunk_overlap must be less than chunk_size")
        return v

    model_config = ConfigDict(extra="forbid")
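Usage sketch (not part of the package diff): a minimal example of how TextSplitterSchema validates its fields, assuming the module path shown above; the tokenizer name and sizes are placeholders, not package defaults.

from nv_ingest_api.internal.schemas.transform.transform_text_splitter_schema import TextSplitterSchema

# Valid configuration: overlap is strictly smaller than chunk_size.
cfg = TextSplitterSchema(tokenizer="bert-base-uncased", chunk_size=512, chunk_overlap=64)
print(cfg.chunk_size, cfg.chunk_overlap)  # 512 64

# The field validator rejects chunk_overlap >= chunk_size.
try:
    TextSplitterSchema(chunk_size=128, chunk_overlap=128)
except ValueError as exc:
    print(exc)  # ... chunk_overlap must be less than chunk_size

# extra="forbid" also rejects unknown keys.
try:
    TextSplitterSchema(chunk_size=256, unexpected_option=True)
except ValueError as exc:
    print(exc)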
nv_ingest_api/internal/store/embed_text_upload.py
@@ -0,0 +1,236 @@
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
import os
from typing import Any, Union, Optional
from typing import Dict

import pandas as pd
from minio import Minio
from pymilvus import Collection
from pymilvus import connections
from pymilvus.bulk_writer.constants import BulkFileType
from pymilvus.bulk_writer.remote_bulk_writer import RemoteBulkWriter
from pydantic import BaseModel

from nv_ingest_api.internal.enums.common import ContentTypeEnum
from nv_ingest_api.internal.schemas.store.store_embedding_schema import EmbeddingStorageSchema

logger = logging.getLogger(__name__)

_DEFAULT_ENDPOINT = os.environ.get("MINIO_INTERNAL_ADDRESS", "minio:9000")
_DEFAULT_BUCKET_NAME = os.environ.get("MINIO_BUCKET", "nv-ingest")


def _upload_text_embeddings(df_store_ledger: pd.DataFrame, task_config: Dict[str, Any]) -> pd.DataFrame:
    """
    Uploads embeddings to MinIO for contents (e.g., images) contained in a DataFrame.
    The image metadata in the "metadata" column is updated with the URL (or path) of the uploaded data.

    This function performs the following steps:
    1. Initializes a MinIO client using the provided task configuration parameters.
    2. Connects to a Milvus instance and retrieves the collection schema.
    3. Ensures that the target bucket exists (creating it if necessary).
    4. Configures a RemoteBulkWriter to upload embedding data in PARQUET format.
    5. Iterates over each row in the DataFrame, updates the metadata with the bucket path, and appends
       rows to the writer if an embedding is present.
    6. Commits the writer, finalizing the upload process.

    Parameters
    ----------
    df_store_ledger : pd.DataFrame
        DataFrame containing the data to upload. Each row is expected to have:
        - A "metadata" column (a dictionary) that includes keys such as "content", "embedding",
          "source_metadata", and "content_metadata".
        - A "document_type" column indicating the type of document (e.g., IMAGE, STRUCTURED).
    task_config : Dict[str, Any]
        Dictionary of parameters for the upload. Expected keys include:
        - "minio_access_key": Optional[str]
            Access key for MinIO.
        - "minio_secret_key": Optional[str]
            Secret key for MinIO.
        - "minio_endpoint": str, default _DEFAULT_ENDPOINT
            MinIO endpoint URL.
        - "minio_bucket_name": str, default _DEFAULT_BUCKET_NAME
            Name of the bucket in MinIO.
        - "minio_bucket_path": str, default "embeddings"
            Path within the bucket where embeddings are stored.
        - "minio_session_token": Optional[str]
            (Optional) Session token for MinIO.
        - "minio_secure": bool, default False
            Whether to use a secure connection to MinIO.
        - "minio_region": Optional[str]
            (Optional) Region for the MinIO service.
        - "milvus_address": str, default "milvus"
            Address of the Milvus service.
        - "milvus_uri": str, default "http://milvus:19530"
            URI for Milvus.
        - "milvus_host": str, default "milvus"
            Host for Milvus.
        - "milvus_port": int, default 19530
            Port for Milvus.
        - "collection_name": str, default "nv_ingest_collection"
            Name of the Milvus collection from which to retrieve the schema.

    Returns
    -------
    pd.DataFrame
        The input DataFrame with updated "metadata" columns containing the uploaded embedding URL
        (or bucket path).

    Raises
    ------
    Exception
        Propagates any exception encountered during the upload process, wrapping it with additional context.
    """
    try:
        # Retrieve connection parameters for MinIO
        minio_access_key: Optional[str] = task_config.get("minio_access_key")
        minio_secret_key: Optional[str] = task_config.get("minio_secret_key")
        minio_endpoint: str = task_config.get("minio_endpoint", _DEFAULT_ENDPOINT)
        minio_bucket_name: str = task_config.get("minio_bucket_name", _DEFAULT_BUCKET_NAME)
        minio_bucket_path: str = task_config.get("minio_bucket_path", "embeddings")

        # Retrieve connection parameters for Milvus
        milvus_address: str = task_config.get("milvus_address", "milvus")
        milvus_uri: str = task_config.get("milvus_uri", "http://milvus:19530")
        milvus_host: str = task_config.get("milvus_host", "milvus")
        milvus_port: int = task_config.get("milvus_port", 19530)
        milvus_collection_name: str = task_config.get("collection_name", "nv_ingest_collection")

        # Initialize MinIO client
        client = Minio(
            minio_endpoint,
            access_key=minio_access_key,
            secret_key=minio_secret_key,
            session_token=task_config.get("minio_session_token"),
            secure=task_config.get("minio_secure", False),
            region=task_config.get("minio_region"),
        )

        # Connect to Milvus and retrieve collection schema
        connections.connect(
            address=milvus_address,
            uri=f"{milvus_uri}:{milvus_port}",
            host=milvus_host,
            port=milvus_port,
        )
        schema = Collection(milvus_collection_name).schema

        # Ensure bucket exists
        if not client.bucket_exists(minio_bucket_name):
            client.make_bucket(minio_bucket_name)
            logger.debug("Created bucket %s", minio_bucket_name)
        else:
            logger.debug("Bucket %s already exists", minio_bucket_name)

        # Setup connection parameters for RemoteBulkWriter
        conn = RemoteBulkWriter.ConnectParam(
            endpoint=minio_endpoint,
            access_key=minio_access_key,
            secret_key=minio_secret_key,
            bucket_name=minio_bucket_name,
            secure=False,
        )
        writer = RemoteBulkWriter(
            schema=schema,
            remote_path=minio_bucket_path,
            connect_param=conn,
            file_type=BulkFileType.PARQUET,
        )

        # Process each row in the DataFrame
        for idx, row in df_store_ledger.iterrows():
            metadata: Dict[str, Any] = row["metadata"].copy()
            # Update embedding metadata with the bucket path
            metadata["embedding_metadata"] = {"uploaded_embedding_url": minio_bucket_path}

            doc_type = row["document_type"]
            content_replace: bool = doc_type in [ContentTypeEnum.IMAGE, ContentTypeEnum.STRUCTURED]
            location: str = metadata["source_metadata"]["source_location"]
            content = metadata["content"]

            # If an embedding exists, update metadata and append the row for upload
            if metadata.get("embedding") is not None:
                logger.error(f"row type: {doc_type} - {location} - {len(content)}")
                df_store_ledger.at[idx, "metadata"] = metadata

                writer.append_row(
                    {
                        "text": location if content_replace else content,
                        "source": metadata["source_metadata"],
                        "content_metadata": metadata["content_metadata"],
                        "vector": metadata["embedding"],
                    }
                )

        writer.commit()
        return df_store_ledger

    except Exception as e:
        err_msg = f"upload_embeddings: Error uploading embeddings. Original error: {e}"
        logger.error(err_msg, exc_info=True)
        raise type(e)(err_msg) from e


def store_text_embeddings_internal(
    df_store_ledger: pd.DataFrame,
    task_config: Union[BaseModel, Dict[str, Any]],
    store_config: EmbeddingStorageSchema,
    execution_trace_log: Optional[Dict[str, Any]] = None,
) -> pd.DataFrame:
    """
    Stores embeddings by uploading content from a DataFrame to MinIO.

    This function prepares the necessary parameters for the upload based on the task configuration,
    invokes the upload routine, and returns the updated DataFrame.

    Parameters
    ----------
    df_store_ledger : pd.DataFrame
        DataFrame containing the data whose embeddings need to be stored.
    task_config : Union[BaseModel, Dict[str, Any]]
        Task configuration. If it is a Pydantic model, it will be converted to a dictionary.
    store_config : Dict[str, Any]
        Configuration parameters for storage (not directly used in the current implementation).
    execution_trace_log : Optional[Dict[str, Any]], default=None
        Optional dictionary for trace logging information.

    Returns
    -------
    pd.DataFrame
        The updated DataFrame after embeddings have been uploaded and metadata updated.

    Raises
    ------
    Exception
        If any error occurs during the storage process, it is logged and re-raised with additional context.
    """

    _ = store_config  # Unused
    _ = execution_trace_log  # Unused

    try:
        # Convert Pydantic model to dict if necessary
        if isinstance(task_config, BaseModel):
            task_config = task_config.model_dump()

        # Set content types for embeddings and update params
        content_types = {ContentTypeEnum.EMBEDDING: True}
        params: Dict[str, Any] = task_config.get("params", {})
        params["content_types"] = content_types

        # Perform the upload of embeddings
        df_store_ledger = _upload_text_embeddings(df_store_ledger, params)

        result, execution_trace_log = df_store_ledger, {}
        _ = execution_trace_log  # Unused

        return result

    except Exception as e:
        err_msg = f"_store_embeddings: Failed to store embeddings: {e}"
        logger.error(err_msg, exc_info=True)
        raise type(e)(err_msg) from e
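Usage sketch (not part of the package diff): a minimal call to store_text_embeddings_internal, assuming reachable MinIO and Milvus services and an existing Milvus collection. The endpoints, credentials, and row contents below are placeholders; store_config is ignored by the current implementation, so None is passed here for brevity.

import pandas as pd

from nv_ingest_api.internal.store.embed_text_upload import store_text_embeddings_internal

# Hypothetical ledger row: the uploader reads "embedding", "content", "source_metadata",
# and "content_metadata" from the nested metadata dict.
df_ledger = pd.DataFrame(
    [
        {
            "document_type": "text",
            "metadata": {
                "content": "example chunk text",
                "embedding": [0.1, 0.2, 0.3],
                "source_metadata": {"source_location": "s3://bucket/doc.pdf"},
                "content_metadata": {"type": "text"},
            },
        }
    ]
)

# Connection settings travel under "params"; "milvus_uri" is given without a port here
# because the helper appends ":{milvus_port}" when connecting.
task_config = {
    "params": {
        "minio_endpoint": "minio:9000",
        "minio_access_key": "minioadmin",
        "minio_secret_key": "minioadmin",
        "collection_name": "nv_ingest_collection",
        "milvus_uri": "http://milvus",
        "milvus_port": 19530,
    }
}

updated = store_text_embeddings_internal(df_ledger, task_config, store_config=None)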
nv_ingest_api/internal/store/image_upload.py
@@ -0,0 +1,251 @@
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import base64
import logging
import os
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd
from upath import UPath

from nv_ingest_api.internal.enums.common import ContentTypeEnum

logger = logging.getLogger(__name__)


def _resolve_storage_root(storage_uri: str, storage_options: Dict[str, Any]) -> Tuple[UPath, str]:
    """
    Construct a UPath instance rooted at the configured URI and return both the root path and protocol.
    """
    storage_root = UPath(storage_uri, **storage_options)
    protocol = storage_root._url.scheme or "file"
    return storage_root, protocol


def _extract_image_type(doc_type: Any, metadata: Dict[str, Any]) -> str:
    """
    Determine the image type to use when writing the decoded content based on the document type.
    """

    def _normalize(raw_value: Any, default: str = "png") -> str:
        if raw_value is None:
            return default
        if hasattr(raw_value, "value"):
            return str(raw_value.value).lower()
        string_value = str(raw_value).strip()
        return string_value.lower() if string_value else default

    if doc_type == ContentTypeEnum.IMAGE:
        image_metadata = metadata.get("image_metadata", {})
        return _normalize(image_metadata.get("image_type"))

    if doc_type == ContentTypeEnum.STRUCTURED:
        table_metadata = metadata.get("table_metadata", {})
        return _normalize(table_metadata.get("image_type"))

    return "png"


def _build_destination_path(storage_root: UPath, source_id: str, row_index: int, image_type: str) -> Tuple[UPath, str]:
    """
    Build the destination UPath for the decoded content and return both the destination and relative key.
    """
    safe_source_name = os.path.basename(source_id.rstrip("/")) or "source"
    clean_source_name = safe_source_name.replace("/", "_")

    destination: UPath = storage_root / clean_source_name / f"{row_index}.{image_type}"
    destination.parent.mkdir(parents=True, exist_ok=True)
    relative_key = destination.relative_to(storage_root).as_posix()
    return destination, relative_key


def _upload_images_via_fsspec(df: pd.DataFrame, params: Dict[str, Any]) -> pd.DataFrame:
    """
    Identifies content within a DataFrame and persists it using an fsspec-compatible filesystem, updating
    metadata with the resulting URIs.

    This function iterates over rows of the provided DataFrame. For rows whose "document_type" is listed
    in the provided 'content_types' configuration, it decodes the base64-encoded content, writes the object
    via fsspec/UPath, and updates the metadata with the resolved URL. Errors during individual row processing
    are logged and skipped so the process continues for remaining rows.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame containing rows with content and associated metadata.
    params : Dict[str, Any]
        A flat dictionary of configuration parameters for the upload. Expected keys include:
        - "content_types": Dict mapping document types to booleans.
        - "storage_uri": Base URI (file://, s3://, etc.) where images should be written.
        - "storage_options": Optional dictionary forwarded to UPath/fsspec constructors.
        - "public_base_url": Optional HTTP(s) base used to surface stored objects.

    Returns
    -------
    pd.DataFrame
        The updated DataFrame with metadata reflecting the uploaded URLs. Rows that encountered errors
        during processing will remain unchanged.

    Raises
    ------
    ValueError
        If the required "content_types" key is missing or is not a dictionary.
    Exception
        Propagates any critical exceptions not handled at the row level.
    """
    # Validate required configuration
    content_types = params.get("content_types")
    if not isinstance(content_types, dict):
        raise ValueError("Invalid configuration: 'content_types' must be provided as a dictionary in params")

    storage_uri: Optional[str] = params.get("storage_uri")
    if not storage_uri or not storage_uri.strip():
        raise ValueError("`storage_uri` must be provided in task params.")

    storage_options: Dict[str, Any] = params.get("storage_options") or {}
    public_base_url: Optional[str] = params.get("public_base_url")

    storage_root, protocol = _resolve_storage_root(storage_uri, storage_options)

    # Process each row and attempt to upload images
    for idx, row in df.iterrows():
        try:
            doc_type = row.get("document_type")
            if doc_type not in content_types:
                continue

            metadata = row.get("metadata")
            if not isinstance(metadata, dict):
                logger.error("Row %s: 'metadata' is not a dictionary", idx)
                continue

            # Validate required metadata fields
            if "content" not in metadata:
                logger.error("Row %s: missing 'content' in metadata", idx)
                continue

            if "source_metadata" not in metadata or not isinstance(metadata["source_metadata"], dict):
                logger.error("Row %s: missing or invalid 'source_metadata' in metadata", idx)
                continue

            source_metadata = metadata["source_metadata"]
            if "source_id" not in source_metadata:
                logger.error("Row %s: missing 'source_id' in source_metadata", idx)
                continue

            # Decode the content from base64
            content = base64.b64decode(metadata["content"].encode())
            source_id = source_metadata["source_id"]

            image_type = _extract_image_type(doc_type, metadata)

            # Construct destination file path
            destination, relative_key = _build_destination_path(
                storage_root=storage_root,
                source_id=source_id,
                row_index=idx,
                image_type=image_type,
            )
            with destination.open("wb") as target_file:
                target_file.write(content)

            destination_uri = destination.as_uri()
            public_url: Optional[str] = None
            if public_base_url:
                public_url = f"{public_base_url.rstrip('/')}/{relative_key}"

            primary_uri = public_url or destination_uri
            source_metadata["source_location"] = primary_uri

            local_uri: Optional[str] = None
            if protocol == "file":
                local_uri = destination.path
                source_metadata["local_source_location"] = local_uri

            if doc_type == ContentTypeEnum.IMAGE:
                logger.debug("Persisting image data for row %s", idx)
                image_metadata = metadata.get("image_metadata", {})
                if public_url is not None:
                    image_metadata["uploaded_image_url"] = public_url
                if local_uri is not None:
                    image_metadata["uploaded_image_local_path"] = local_uri
                metadata["image_metadata"] = image_metadata
            elif doc_type == ContentTypeEnum.STRUCTURED:
                logger.debug("Persisting structured image data for row %s", idx)
                table_metadata = metadata.get("table_metadata", {})
                if public_url is not None:
                    table_metadata["uploaded_image_url"] = public_url
                if local_uri is not None:
                    table_metadata["uploaded_image_local_path"] = local_uri
                metadata["table_metadata"] = table_metadata

            df.at[idx, "metadata"] = metadata

        except Exception as e:
            logger.exception("Failed to process row %s: %s", idx, e)
            # Continue processing the remaining rows

    return df


def store_images_to_minio_internal(
    df_storage_ledger: pd.DataFrame,
    task_config: Dict[str, Any],
    storage_config: Dict[str, Any],
    execution_trace_log: Optional[List[Any]] = None,
) -> pd.DataFrame:
    """
    Processes a storage ledger DataFrame to persist images (and structured content) via an fsspec-compatible
    filesystem.

    This function validates the input DataFrame and task configuration, then creates a mask to select rows
    where the "document_type" is among the desired types specified in the configuration. If matching rows are
    found, it calls the internal upload function to process and update the DataFrame; otherwise, it returns the
    original DataFrame unmodified.

    Parameters
    ----------
    df_storage_ledger : pd.DataFrame
        The DataFrame containing storage ledger information, which must include at least the columns
        "document_type" and "metadata".
    task_config : Dict[str, Any]
        A flat dictionary containing configuration parameters for image storage. Expected to include the key
        "content_types" (a dict mapping document types to booleans) along with `storage_uri`,
        `storage_options`, and optional presentation hints such as `public_base_url`.
    storage_config : Dict[str, Any]
        A dictionary reserved for additional storage configuration (currently unused).
    execution_trace_log : Optional[List[Any]], optional
        An optional list for capturing execution trace details (currently unused), by default None.

    Returns
    -------
    pd.DataFrame
        The updated DataFrame after attempting to upload images for rows with matching document types. Rows
        that do not match remain unchanged.

    Raises
    ------
    ValueError
        If the input DataFrame is missing required columns or if the task configuration is invalid.
    """
    # Validate that required keys and columns exist
    if "content_types" not in task_config or not isinstance(task_config["content_types"], dict):
        raise ValueError("Task configuration must include a valid 'content_types' dictionary.")

    if "document_type" not in df_storage_ledger.columns:
        raise ValueError("Input DataFrame must contain a 'document_type' column.")

    content_types = task_config["content_types"]

    # Create a mask for rows where "document_type" is one of the desired types
    storage_obj_mask = df_storage_ledger["document_type"].isin(list(content_types.keys()))
    if (~storage_obj_mask).all():
        logger.debug("No storage objects matching %s found in the DataFrame.", content_types)
        return df_storage_ledger

    result, execution_trace_log = _upload_images_via_fsspec(df_storage_ledger, task_config), {}
    _ = execution_trace_log

    return result