nv-ingest-api 2025.4.21.dev20250421__py3-none-any.whl → 2025.4.22.dev20250422__py3-none-any.whl

This diff represents the content of publicly released package versions and reflects the changes between them as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of nv-ingest-api might be problematic.

Files changed (153)
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +215 -0
  3. nv_ingest_api/interface/extract.py +972 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +218 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +200 -0
  8. nv_ingest_api/internal/enums/__init__.py +3 -0
  9. nv_ingest_api/internal/enums/common.py +494 -0
  10. nv_ingest_api/internal/extract/__init__.py +3 -0
  11. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
  13. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  14. nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
  15. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  16. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
  19. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  20. nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
  21. nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
  22. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
  24. nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
  25. nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
  26. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  27. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  28. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  29. nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
  30. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
  31. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
  32. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
  33. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  34. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  35. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  36. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  37. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  38. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
  39. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
  40. nv_ingest_api/internal/mutate/__init__.py +3 -0
  41. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  42. nv_ingest_api/internal/mutate/filter.py +133 -0
  43. nv_ingest_api/internal/primitives/__init__.py +0 -0
  44. nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
  45. nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
  46. nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
  47. nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
  48. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  49. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  50. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  51. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  52. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
  53. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
  59. nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
  60. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
  61. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  62. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  63. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  64. nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
  65. nv_ingest_api/internal/schemas/__init__.py +3 -0
  66. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  67. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
  68. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
  69. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
  70. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
  71. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
  72. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
  73. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
  74. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
  75. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  76. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
  77. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  78. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  79. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  80. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  81. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
  82. nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
  83. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  85. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  86. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  87. nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
  88. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  89. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
  90. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  91. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
  92. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
  93. nv_ingest_api/internal/store/__init__.py +3 -0
  94. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  95. nv_ingest_api/internal/store/image_upload.py +232 -0
  96. nv_ingest_api/internal/transform/__init__.py +3 -0
  97. nv_ingest_api/internal/transform/caption_image.py +205 -0
  98. nv_ingest_api/internal/transform/embed_text.py +496 -0
  99. nv_ingest_api/internal/transform/split_text.py +157 -0
  100. nv_ingest_api/util/__init__.py +0 -0
  101. nv_ingest_api/util/control_message/__init__.py +0 -0
  102. nv_ingest_api/util/control_message/validators.py +47 -0
  103. nv_ingest_api/util/converters/__init__.py +0 -0
  104. nv_ingest_api/util/converters/bytetools.py +78 -0
  105. nv_ingest_api/util/converters/containers.py +65 -0
  106. nv_ingest_api/util/converters/datetools.py +90 -0
  107. nv_ingest_api/util/converters/dftools.py +127 -0
  108. nv_ingest_api/util/converters/formats.py +64 -0
  109. nv_ingest_api/util/converters/type_mappings.py +27 -0
  110. nv_ingest_api/util/detectors/__init__.py +5 -0
  111. nv_ingest_api/util/detectors/language.py +38 -0
  112. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  113. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  114. nv_ingest_api/util/exception_handlers/decorators.py +223 -0
  115. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  116. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  117. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  118. nv_ingest_api/util/image_processing/__init__.py +5 -0
  119. nv_ingest_api/util/image_processing/clustering.py +260 -0
  120. nv_ingest_api/util/image_processing/processing.py +179 -0
  121. nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
  122. nv_ingest_api/util/image_processing/transforms.py +407 -0
  123. nv_ingest_api/util/logging/__init__.py +0 -0
  124. nv_ingest_api/util/logging/configuration.py +31 -0
  125. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  126. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  127. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  128. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  129. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +451 -0
  130. nv_ingest_api/util/metadata/__init__.py +5 -0
  131. nv_ingest_api/util/metadata/aggregators.py +469 -0
  132. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  133. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
  134. nv_ingest_api/util/nim/__init__.py +56 -0
  135. nv_ingest_api/util/pdf/__init__.py +3 -0
  136. nv_ingest_api/util/pdf/pdfium.py +427 -0
  137. nv_ingest_api/util/schema/__init__.py +0 -0
  138. nv_ingest_api/util/schema/schema_validator.py +10 -0
  139. nv_ingest_api/util/service_clients/__init__.py +3 -0
  140. nv_ingest_api/util/service_clients/client_base.py +86 -0
  141. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  142. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  143. nv_ingest_api/util/service_clients/redis/redis_client.py +823 -0
  144. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  145. nv_ingest_api/util/service_clients/rest/rest_client.py +531 -0
  146. nv_ingest_api/util/string_processing/__init__.py +51 -0
  147. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/METADATA +1 -1
  148. nv_ingest_api-2025.4.22.dev20250422.dist-info/RECORD +152 -0
  149. nv_ingest_api-2025.4.21.dev20250421.dist-info/RECORD +0 -9
  150. /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
  151. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/WHEEL +0 -0
  152. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/top_level.txt +0 -0

nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py (new file)
@@ -0,0 +1,16 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import logging
+
+from pydantic import ConfigDict, BaseModel
+from pydantic import StrictBool
+
+logger = logging.getLogger(__name__)
+
+
+class ImageDedupSchema(BaseModel):
+    raise_on_failure: StrictBool = False
+    model_config = ConfigDict(extra="forbid")
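
A minimal usage sketch of the new schema, assuming the import path from the file list above: StrictBool rejects truthy non-bool values, and extra="forbid" rejects unrecognized fields.

from pydantic import ValidationError

from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import ImageDedupSchema

config = ImageDedupSchema(raise_on_failure=True)  # real booleans are accepted

try:
    ImageDedupSchema(raise_on_failure="yes")  # StrictBool: non-bool input raises
except ValidationError as err:
    print(err)

try:
    ImageDedupSchema(unexpected=1)  # extra="forbid": unknown keys raise
except ValidationError as err:
    print(err)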

nv_ingest_api/internal/schemas/store/__init__.py (new file)
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0

nv_ingest_api/internal/schemas/store/store_embedding_schema.py (new file)
@@ -0,0 +1,28 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+
+from pydantic import ConfigDict, BaseModel
+
+logger = logging.getLogger(__name__)
+
+
+class EmbeddingStorageSchema(BaseModel):
+    raise_on_failure: bool = False
+    model_config = ConfigDict(extra="forbid")

nv_ingest_api/internal/schemas/store/store_image_schema.py (new file)
@@ -0,0 +1,30 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+
+from pydantic import ConfigDict, BaseModel
+
+logger = logging.getLogger(__name__)
+
+
+class ImageStorageModuleSchema(BaseModel):
+    structured: bool = True
+    images: bool = True
+    raise_on_failure: bool = False
+    model_config = ConfigDict(extra="forbid")

nv_ingest_api/internal/schemas/transform/__init__.py (new file)
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0

nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py (new file)
@@ -0,0 +1,15 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+from pydantic import ConfigDict, BaseModel
+
+
+class ImageCaptionExtractionSchema(BaseModel):
+    api_key: str = "api_key"
+    endpoint_url: str = "https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct/chat/completions"
+    prompt: str = "Caption the content of this image:"
+    model_name: str = "meta/llama-3.2-11b-vision-instruct"
+    raise_on_failure: bool = False
+    model_config = ConfigDict(extra="forbid")
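
A minimal configuration sketch, assuming the import path from the file list above; the defaults target NVIDIA's hosted llama-3.2-11b-vision-instruct endpoint, and any field can be overridden at construction time (the API key below is a placeholder).

from nv_ingest_api.internal.schemas.transform.transform_image_caption_schema import ImageCaptionExtractionSchema

caption_config = ImageCaptionExtractionSchema(
    api_key="nvapi-placeholder",  # placeholder; supply a real NVIDIA API key
    prompt="Describe this chart in one sentence:",
    raise_on_failure=True,
)
print(caption_config.model_name)  # meta/llama-3.2-11b-vision-instruct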

nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py (new file)
@@ -0,0 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import logging
+
+from pydantic import ConfigDict, BaseModel
+from pydantic import StrictBool
+
+logger = logging.getLogger(__name__)
+
+
+class ImageFilterSchema(BaseModel):
+    raise_on_failure: StrictBool = False
+    cpu_only: StrictBool = False
+    model_config = ConfigDict(extra="forbid")

nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py (new file)
@@ -0,0 +1,25 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import logging
+
+from pydantic import ConfigDict, BaseModel
+
+from nv_ingest_api.util.logging.configuration import LogLevel
+
+logger = logging.getLogger(__name__)
+
+
+class TextEmbeddingSchema(BaseModel):
+    api_key: str = "api_key"
+    batch_size: int = 4
+    embedding_model: str = "nvidia/nv-embedqa-e5-v5"
+    embedding_nim_endpoint: str = "http://embedding:8000/v1"
+    encoding_format: str = "float"
+    httpx_log_level: LogLevel = LogLevel.WARNING
+    input_type: str = "passage"
+    raise_on_failure: bool = False
+    truncate: str = "END"
+    model_config = ConfigDict(extra="forbid")
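
A minimal sketch of overriding the embedding defaults, assuming the import path from the file list above and a locally running embedding NIM (the localhost endpoint is an assumption).

from nv_ingest_api.internal.schemas.transform.transform_text_embedding_schema import TextEmbeddingSchema

embed_config = TextEmbeddingSchema(
    embedding_nim_endpoint="http://localhost:8000/v1",  # assumed local endpoint
    batch_size=8,
)
print(embed_config.model_dump())  # full validated configuration as a dict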

nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py (new file)
@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from pydantic import Field, BaseModel, field_validator
+
+from typing import Optional
+
+from typing_extensions import Annotated
+
+
+class TextSplitterSchema(BaseModel):
+    tokenizer: Optional[str] = None
+    chunk_size: Annotated[int, Field(gt=0)] = 1024
+    chunk_overlap: Annotated[int, Field(ge=0)] = 150
+    raise_on_failure: bool = False
+
+    @field_validator("chunk_overlap")
+    def check_chunk_overlap(cls, v, values, **kwargs):
+        if v is not None and "chunk_size" in values.data and v >= values.data["chunk_size"]:
+            raise ValueError("chunk_overlap must be less than chunk_size")
+        return v
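
The chunk_overlap validator runs after chunk_size has been validated, so it can compare the two fields through values.data. A minimal sketch of that behavior, assuming the import path from the file list above:

from pydantic import ValidationError

from nv_ingest_api.internal.schemas.transform.transform_text_splitter_schema import TextSplitterSchema

TextSplitterSchema(chunk_size=512, chunk_overlap=64)  # accepted: overlap < size

try:
    TextSplitterSchema(chunk_size=256, chunk_overlap=256)  # rejected by the validator
except ValidationError as err:
    print(err)  # chunk_overlap must be less than chunk_size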

nv_ingest_api/internal/store/__init__.py (new file)
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0

nv_ingest_api/internal/store/embed_text_upload.py (new file)
@@ -0,0 +1,236 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+import os
+from typing import Any, Union, Optional
+from typing import Dict
+
+import pandas as pd
+from minio import Minio
+from pymilvus import Collection
+from pymilvus import connections
+from pymilvus.bulk_writer.constants import BulkFileType
+from pymilvus.bulk_writer.remote_bulk_writer import RemoteBulkWriter
+from pydantic import BaseModel
+
+from nv_ingest_api.internal.enums.common import ContentTypeEnum
+from nv_ingest_api.internal.schemas.store.store_embedding_schema import EmbeddingStorageSchema
+
+logger = logging.getLogger(__name__)
+
+_DEFAULT_ENDPOINT = os.environ.get("MINIO_INTERNAL_ADDRESS", "minio:9000")
+_DEFAULT_BUCKET_NAME = os.environ.get("MINIO_BUCKET", "nv-ingest")
+
+
+def _upload_text_embeddings(df_store_ledger: pd.DataFrame, task_config: Dict[str, Any]) -> pd.DataFrame:
+    """
+    Uploads embeddings to MinIO for contents (e.g., images) contained in a DataFrame.
+    The image metadata in the "metadata" column is updated with the URL (or path) of the uploaded data.
+
+    This function performs the following steps:
+      1. Initializes a MinIO client using the provided task configuration parameters.
+      2. Connects to a Milvus instance and retrieves the collection schema.
+      3. Ensures that the target bucket exists (creating it if necessary).
+      4. Configures a RemoteBulkWriter to upload embedding data in PARQUET format.
+      5. Iterates over each row in the DataFrame, updates the metadata with the bucket path, and appends
+         rows to the writer if an embedding is present.
+      6. Commits the writer, finalizing the upload process.
+
+    Parameters
+    ----------
+    df_store_ledger : pd.DataFrame
+        DataFrame containing the data to upload. Each row is expected to have:
+          - A "metadata" column (a dictionary) that includes keys such as "content", "embedding",
+            "source_metadata", and "content_metadata".
+          - A "document_type" column indicating the type of document (e.g., IMAGE, STRUCTURED).
+    task_config : Dict[str, Any]
+        Dictionary of parameters for the upload. Expected keys include:
+          - "minio_access_key": Optional[str]
+              Access key for MinIO.
+          - "minio_secret_key": Optional[str]
+              Secret key for MinIO.
+          - "minio_endpoint": str, default _DEFAULT_ENDPOINT
+              MinIO endpoint URL.
+          - "minio_bucket_name": str, default _DEFAULT_BUCKET_NAME
+              Name of the bucket in MinIO.
+          - "minio_bucket_path": str, default "embeddings"
+              Path within the bucket where embeddings are stored.
+          - "minio_session_token": Optional[str]
+              (Optional) Session token for MinIO.
+          - "minio_secure": bool, default False
+              Whether to use a secure connection to MinIO.
+          - "minio_region": Optional[str]
+              (Optional) Region for the MinIO service.
+          - "milvus_address": str, default "milvus"
+              Address of the Milvus service.
+          - "milvus_uri": str, default "http://milvus:19530"
+              URI for Milvus.
+          - "milvus_host": str, default "milvus"
+              Host for Milvus.
+          - "milvus_port": int, default 19530
+              Port for Milvus.
+          - "collection_name": str, default "nv_ingest_collection"
+              Name of the Milvus collection from which to retrieve the schema.
+
+    Returns
+    -------
+    pd.DataFrame
+        The input DataFrame with updated "metadata" columns containing the uploaded embedding URL
+        (or bucket path).
+
+    Raises
+    ------
+    Exception
+        Propagates any exception encountered during the upload process, wrapping it with additional context.
+    """
+    try:
+        # Retrieve connection parameters for MinIO
+        minio_access_key: Optional[str] = task_config.get("minio_access_key")
+        minio_secret_key: Optional[str] = task_config.get("minio_secret_key")
+        minio_endpoint: str = task_config.get("minio_endpoint", _DEFAULT_ENDPOINT)
+        minio_bucket_name: str = task_config.get("minio_bucket_name", _DEFAULT_BUCKET_NAME)
+        minio_bucket_path: str = task_config.get("minio_bucket_path", "embeddings")
+
+        # Retrieve connection parameters for Milvus
+        milvus_address: str = task_config.get("milvus_address", "milvus")
+        milvus_uri: str = task_config.get("milvus_uri", "http://milvus:19530")
+        milvus_host: str = task_config.get("milvus_host", "milvus")
+        milvus_port: int = task_config.get("milvus_port", 19530)
+        milvus_collection_name: str = task_config.get("collection_name", "nv_ingest_collection")
+
+        # Initialize MinIO client
+        client = Minio(
+            minio_endpoint,
+            access_key=minio_access_key,
+            secret_key=minio_secret_key,
+            session_token=task_config.get("minio_session_token"),
+            secure=task_config.get("minio_secure", False),
+            region=task_config.get("minio_region"),
+        )
+
+        # Connect to Milvus and retrieve collection schema
+        connections.connect(
+            address=milvus_address,
+            uri=f"{milvus_uri}:{milvus_port}",
+            host=milvus_host,
+            port=milvus_port,
+        )
+        schema = Collection(milvus_collection_name).schema
+
+        # Ensure bucket exists
+        if not client.bucket_exists(minio_bucket_name):
+            client.make_bucket(minio_bucket_name)
+            logger.debug("Created bucket %s", minio_bucket_name)
+        else:
+            logger.debug("Bucket %s already exists", minio_bucket_name)
+
+        # Setup connection parameters for RemoteBulkWriter
+        conn = RemoteBulkWriter.ConnectParam(
+            endpoint=minio_endpoint,
+            access_key=minio_access_key,
+            secret_key=minio_secret_key,
+            bucket_name=minio_bucket_name,
+            secure=False,
+        )
+        writer = RemoteBulkWriter(
+            schema=schema,
+            remote_path=minio_bucket_path,
+            connect_param=conn,
+            file_type=BulkFileType.PARQUET,
+        )
+
+        # Process each row in the DataFrame
+        for idx, row in df_store_ledger.iterrows():
+            metadata: Dict[str, Any] = row["metadata"].copy()
+            # Update embedding metadata with the bucket path
+            metadata["embedding_metadata"] = {"uploaded_embedding_url": minio_bucket_path}
+
+            doc_type = row["document_type"]
+            content_replace: bool = doc_type in [ContentTypeEnum.IMAGE, ContentTypeEnum.STRUCTURED]
+            location: str = metadata["source_metadata"]["source_location"]
+            content = metadata["content"]
+
+            # If an embedding exists, update metadata and append the row for upload
+            if metadata.get("embedding") is not None:
+                logger.error(f"row type: {doc_type} - {location} - {len(content)}")
+                df_store_ledger.at[idx, "metadata"] = metadata
+
+                writer.append_row(
+                    {
+                        "text": location if content_replace else content,
+                        "source": metadata["source_metadata"],
+                        "content_metadata": metadata["content_metadata"],
+                        "vector": metadata["embedding"],
+                    }
+                )
+
+        writer.commit()
+        return df_store_ledger
+
+    except Exception as e:
+        err_msg = f"upload_embeddings: Error uploading embeddings. Original error: {e}"
+        logger.error(err_msg, exc_info=True)
+        raise type(e)(err_msg) from e
+
+
+def store_text_embeddings_internal(
+    df_store_ledger: pd.DataFrame,
+    task_config: Union[BaseModel, Dict[str, Any]],
+    store_config: EmbeddingStorageSchema,
+    execution_trace_log: Optional[Dict[str, Any]] = None,
+) -> pd.DataFrame:
+    """
+    Stores embeddings by uploading content from a DataFrame to MinIO.
+
+    This function prepares the necessary parameters for the upload based on the task configuration,
+    invokes the upload routine, and returns the updated DataFrame.
+
+    Parameters
+    ----------
+    df_store_ledger : pd.DataFrame
+        DataFrame containing the data whose embeddings need to be stored.
+    task_config : Union[BaseModel, Dict[str, Any]]
+        Task configuration. If it is a Pydantic model, it will be converted to a dictionary.
+    store_config : Dict[str, Any]
+        Configuration parameters for storage (not directly used in the current implementation).
+    execution_trace_log : Optional[Dict[str, Any]], default=None
+        Optional dictionary for trace logging information.
+
+    Returns
+    -------
+    pd.DataFrame
+        The updated DataFrame after embeddings have been uploaded and metadata updated.
+
+    Raises
+    ------
+    Exception
+        If any error occurs during the storage process, it is logged and re-raised with additional context.
+    """
+
+    _ = store_config  # Unused
+    _ = execution_trace_log  # Unused
+
+    try:
+        # Convert Pydantic model to dict if necessary
+        if isinstance(task_config, BaseModel):
+            task_config = task_config.model_dump()
+
+        # Set content types for embeddings and update params
+        content_types = {ContentTypeEnum.EMBEDDING: True}
+        params: Dict[str, Any] = task_config.get("params", {})
+        params["content_types"] = content_types
+
+        # Perform the upload of embeddings
+        df_store_ledger = _upload_text_embeddings(df_store_ledger, params)
+
+        result, execution_trace_log = df_store_ledger, {}
+        _ = execution_trace_log  # Unused
+
+        return result
+
+    except Exception as e:
+        err_msg = f"_store_embeddings: Failed to store embeddings: {e}"
+        logger.error(err_msg, exc_info=True)
+        raise type(e)(err_msg) from e
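
A minimal end-to-end sketch of the public entry point, assuming reachable MinIO and Milvus services, a pre-created Milvus collection whose vector dimension matches the embedding, and a TEXT member on ContentTypeEnum; the ledger row shape follows the docstring above.

import pandas as pd

from nv_ingest_api.internal.enums.common import ContentTypeEnum
from nv_ingest_api.internal.schemas.store.store_embedding_schema import EmbeddingStorageSchema
from nv_ingest_api.internal.store.embed_text_upload import store_text_embeddings_internal

ledger = pd.DataFrame(
    [
        {
            "document_type": ContentTypeEnum.TEXT,  # assumed enum member
            "metadata": {
                "content": "some extracted text",
                "embedding": [0.1] * 1024,  # dimension must match the collection schema
                "source_metadata": {"source_id": "doc-1", "source_location": ""},
                "content_metadata": {"type": "text"},
            },
        }
    ]
)

task_config = {
    "params": {
        "minio_access_key": "minioadmin",  # assumed local-dev credentials
        "minio_secret_key": "minioadmin",
        "collection_name": "nv_ingest_collection",
    }
}

updated = store_text_embeddings_internal(ledger, task_config, EmbeddingStorageSchema())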

nv_ingest_api/internal/store/image_upload.py (new file)
@@ -0,0 +1,232 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import base64
+import logging
+import os
+from io import BytesIO
+from typing import Any, List, Optional
+from typing import Dict
+from urllib.parse import quote
+
+import pandas as pd
+from minio import Minio
+
+from nv_ingest_api.internal.enums.common import ContentTypeEnum
+
+logger = logging.getLogger(__name__)
+
+# TODO: Move these into microservice_entrypoint.py to populate the stage and validate them using the pydantic schema
+# on startup.
+_DEFAULT_ENDPOINT = os.environ.get("MINIO_INTERNAL_ADDRESS", "minio:9000")
+_DEFAULT_READ_ADDRESS = os.environ.get("MINIO_PUBLIC_ADDRESS", "http://minio:9000")
+_DEFAULT_BUCKET_NAME = os.environ.get("MINIO_BUCKET", "nv-ingest")
+
+
+def _ensure_bucket_exists(client: Minio, bucket_name: str) -> None:
+    """
+    Ensure that the specified bucket exists in MinIO, and create it if it does not.
+
+    Parameters
+    ----------
+    client : Minio
+        An instance of the Minio client.
+    bucket_name : str
+        The name of the bucket to check or create.
+    """
+    if not client.bucket_exists(bucket_name):
+        client.make_bucket(bucket_name)
+        logger.debug("Created bucket %s", bucket_name)
+    else:
+        logger.debug("Bucket %s already exists", bucket_name)
+
+
+def _upload_images_to_minio(df: pd.DataFrame, params: Dict[str, Any]) -> pd.DataFrame:
+    """
+    Identifies content within a DataFrame and uploads it to MinIO, updating the metadata with the uploaded URL.
+
+    This function iterates over rows of the provided DataFrame. For rows whose "document_type" is listed
+    in the provided 'content_types' configuration, it decodes the base64-encoded content, uploads the object to
+    MinIO, and updates the metadata with the public URL. Errors during individual row processing are logged and
+    skipped, so the process continues for remaining rows.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The DataFrame containing rows with content and associated metadata.
+    params : Dict[str, Any]
+        A flat dictionary of configuration parameters for the upload. Expected keys include:
+          - "content_types": Dict mapping document types to booleans.
+          - "endpoint": URL for the MinIO service (optional; defaults to _DEFAULT_ENDPOINT).
+          - "bucket_name": Bucket name for storing objects (optional; defaults to _DEFAULT_BUCKET_NAME).
+          - "access_key": Access key for MinIO.
+          - "secret_key": Secret key for MinIO.
+          - "session_token": Session token for MinIO (optional).
+          - "secure": Boolean indicating if HTTPS should be used.
+          - "region": Region for the MinIO service (optional).
+
+    Returns
+    -------
+    pd.DataFrame
+        The updated DataFrame with metadata reflecting the uploaded URLs. Rows that encountered errors
+        during processing will remain unchanged.
+
+    Raises
+    ------
+    ValueError
+        If the required "content_types" key is missing or is not a dictionary.
+    Exception
+        Propagates any critical exceptions not handled at the row level.
+    """
+    # Validate required configuration
+    content_types = params.get("content_types")
+    if not isinstance(content_types, dict):
+        raise ValueError("Invalid configuration: 'content_types' must be provided as a dictionary in params")
+
+    endpoint: str = params.get("endpoint", _DEFAULT_ENDPOINT)
+    bucket_name: str = params.get("bucket_name", _DEFAULT_BUCKET_NAME)
+
+    # Initialize MinIO client
+    client = Minio(
+        endpoint,
+        access_key=params.get("access_key"),
+        secret_key=params.get("secret_key"),
+        session_token=params.get("session_token"),
+        secure=params.get("secure", False),
+        region=params.get("region"),
+    )
+
+    # Ensure the bucket exists
+    _ensure_bucket_exists(client, bucket_name)
+
+    # Process each row and attempt to upload images
+    for idx, row in df.iterrows():
+        try:
+            doc_type = row.get("document_type")
+            if doc_type not in content_types:
+                continue
+
+            metadata = row.get("metadata")
+            if not isinstance(metadata, dict):
+                logger.error("Row %s: 'metadata' is not a dictionary", idx)
+                continue
+
+            # Validate required metadata fields
+            if "content" not in metadata:
+                logger.error("Row %s: missing 'content' in metadata", idx)
+                continue
+            if "source_metadata" not in metadata or not isinstance(metadata["source_metadata"], dict):
+                logger.error("Row %s: missing or invalid 'source_metadata' in metadata", idx)
+                continue
+
+            source_metadata = metadata["source_metadata"]
+            if "source_id" not in source_metadata:
+                logger.error("Row %s: missing 'source_id' in source_metadata", idx)
+                continue
+
+            # Decode the content from base64
+            content = base64.b64decode(metadata["content"].encode())
+            source_id = source_metadata["source_id"]
+
+            # Determine image type (default to 'png')
+            image_type = "png"
+            if doc_type == ContentTypeEnum.IMAGE:
+                image_metadata = metadata.get("image_metadata", {})
+                image_type = image_metadata.get("image_type", "png")
+
+            # Construct destination file path
+            encoded_source_id = quote(source_id, safe="")
+            encoded_image_type = quote(image_type, safe="")
+            destination_file = f"{encoded_source_id}/{idx}.{encoded_image_type}"
+
+            # Upload the object to MinIO
+            source_file = BytesIO(content)
+            client.put_object(
+                bucket_name,
+                destination_file,
+                source_file,
+                length=len(content),
+            )
+
+            # Construct the public URL
+            public_url = f"{_DEFAULT_READ_ADDRESS}/{bucket_name}/{destination_file}"
+            source_metadata["source_location"] = public_url
+
+            if doc_type == ContentTypeEnum.IMAGE:
+                logger.debug("Storing image data to Minio for row %s", idx)
+                image_metadata = metadata.get("image_metadata", {})
+                image_metadata["uploaded_image_url"] = public_url
+                metadata["image_metadata"] = image_metadata
+            elif doc_type == ContentTypeEnum.STRUCTURED:
+                logger.debug("Storing structured image data to Minio for row %s", idx)
+                table_metadata = metadata.get("table_metadata", {})
+                table_metadata["uploaded_image_url"] = public_url
+                metadata["table_metadata"] = table_metadata
+
+            df.at[idx, "metadata"] = metadata
+
+        except Exception as e:
+            logger.exception("Failed to process row %s: %s", idx, e)
+            # Continue processing the remaining rows
+
+    return df
+
+
+def store_images_to_minio_internal(
+    df_storage_ledger: pd.DataFrame,
+    task_config: Dict[str, Any],
+    storage_config: Dict[str, Any],
+    execution_trace_log: Optional[List[Any]] = None,
+) -> pd.DataFrame:
+    """
+    Processes a storage ledger DataFrame to upload images (and structured content) to MinIO.
+
+    This function validates the input DataFrame and task configuration, then creates a mask to select rows
+    where the "document_type" is among the desired types specified in the configuration. If matching rows are
+    found, it calls the internal upload function to process and update the DataFrame; otherwise, it returns the
+    original DataFrame unmodified.
+
+    Parameters
+    ----------
+    df_storage_ledger : pd.DataFrame
+        The DataFrame containing storage ledger information, which must include at least the columns
+        "document_type" and "metadata".
+    task_config : Dict[str, Any]
+        A flat dictionary containing configuration parameters for image storage. Expected to include the key
+        "content_types" (a dict mapping document types to booleans) along with connection and credential details.
+    storage_config : Dict[str, Any]
+        A dictionary reserved for additional storage configuration (currently unused).
+    execution_trace_log : Optional[List[Any]], optional
+        An optional list for capturing execution trace details (currently unused), by default None.
+
+    Returns
+    -------
+    pd.DataFrame
+        The updated DataFrame after attempting to upload images for rows with matching document types. Rows
+        that do not match remain unchanged.
+
+    Raises
+    ------
+    ValueError
+        If the input DataFrame is missing required columns or if the task configuration is invalid.
+    """
+    # Validate that required keys and columns exist
+    if "content_types" not in task_config or not isinstance(task_config["content_types"], dict):
+        raise ValueError("Task configuration must include a valid 'content_types' dictionary.")
+
+    if "document_type" not in df_storage_ledger.columns:
+        raise ValueError("Input DataFrame must contain a 'document_type' column.")
+
+    content_types = task_config["content_types"]
+
+    # Create a mask for rows where "document_type" is one of the desired types
+    storage_obj_mask = df_storage_ledger["document_type"].isin(list(content_types.keys()))
+    if (~storage_obj_mask).all():
+        logger.debug("No storage objects matching %s found in the DataFrame.", content_types)
+        return df_storage_ledger
+
+    result, execution_trace_log = _upload_images_to_minio(df_storage_ledger, task_config), {}
+    _ = execution_trace_log
+
+    return result
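
A minimal sketch of store_images_to_minio_internal along the same lines, assuming a reachable MinIO instance; the credentials and image bytes below are placeholders, and the content_types keys must match the ledger's document_type values.

import base64

import pandas as pd

from nv_ingest_api.internal.enums.common import ContentTypeEnum
from nv_ingest_api.internal.store.image_upload import store_images_to_minio_internal

png_bytes = b"\x89PNG\r\n\x1a\n"  # placeholder image bytes

ledger = pd.DataFrame(
    [
        {
            "document_type": ContentTypeEnum.IMAGE,
            "metadata": {
                "content": base64.b64encode(png_bytes).decode(),
                "source_metadata": {"source_id": "doc-1.pdf"},
                "image_metadata": {"image_type": "png"},
            },
        }
    ]
)

task_config = {
    "content_types": {ContentTypeEnum.IMAGE: True},
    "access_key": "minioadmin",  # assumed local-dev credentials
    "secret_key": "minioadmin",
}

updated = store_images_to_minio_internal(ledger, task_config, storage_config={})
print(updated.at[0, "metadata"]["image_metadata"]["uploaded_image_url"])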

nv_ingest_api/internal/transform/__init__.py (new file)
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0