nv-ingest-api 26.1.0rc4 (nv_ingest_api-26.1.0rc4-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic; see the registry listing for details.

Files changed (177)
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py
@@ -0,0 +1,24 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ from pydantic import Field, BaseModel, field_validator, ConfigDict
+
+ from typing import Optional
+
+
+ class TextSplitterSchema(BaseModel):
+     tokenizer: Optional[str] = None
+     chunk_size: int = Field(default=1024, gt=0)
+     chunk_overlap: int = Field(default=150, ge=0)
+     raise_on_failure: bool = False
+
+     @field_validator("chunk_overlap")
+     @classmethod
+     def check_chunk_overlap(cls, v, values):
+         chunk_size = values.data.get("chunk_size")
+         if chunk_size is not None and v >= chunk_size:
+             raise ValueError("chunk_overlap must be less than chunk_size")
+         return v
+
+     model_config = ConfigDict(extra="forbid")
nv_ingest_api/internal/store/__init__.py
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
nv_ingest_api/internal/store/embed_text_upload.py
@@ -0,0 +1,236 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import logging
+ import os
+ from typing import Any, Union, Optional
+ from typing import Dict
+
+ import pandas as pd
+ from minio import Minio
+ from pymilvus import Collection
+ from pymilvus import connections
+ from pymilvus.bulk_writer.constants import BulkFileType
+ from pymilvus.bulk_writer.remote_bulk_writer import RemoteBulkWriter
+ from pydantic import BaseModel
+
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
+ from nv_ingest_api.internal.schemas.store.store_embedding_schema import EmbeddingStorageSchema
+
+ logger = logging.getLogger(__name__)
+
+ _DEFAULT_ENDPOINT = os.environ.get("MINIO_INTERNAL_ADDRESS", "minio:9000")
+ _DEFAULT_BUCKET_NAME = os.environ.get("MINIO_BUCKET", "nv-ingest")
+
+
+ def _upload_text_embeddings(df_store_ledger: pd.DataFrame, task_config: Dict[str, Any]) -> pd.DataFrame:
+     """
+     Uploads embeddings to MinIO for content (e.g., images) contained in a DataFrame.
+     The image metadata in the "metadata" column is updated with the URL (or path) of the uploaded data.
+
+     This function performs the following steps:
+     1. Initializes a MinIO client using the provided task configuration parameters.
+     2. Connects to a Milvus instance and retrieves the collection schema.
+     3. Ensures that the target bucket exists (creating it if necessary).
+     4. Configures a RemoteBulkWriter to upload embedding data in PARQUET format.
+     5. Iterates over each row in the DataFrame, updates the metadata with the bucket path, and appends
+        rows to the writer if an embedding is present.
+     6. Commits the writer, finalizing the upload process.
+
+     Parameters
+     ----------
+     df_store_ledger : pd.DataFrame
+         DataFrame containing the data to upload. Each row is expected to have:
+         - A "metadata" column (a dictionary) that includes keys such as "content", "embedding",
+           "source_metadata", and "content_metadata".
+         - A "document_type" column indicating the type of document (e.g., IMAGE, STRUCTURED).
+     task_config : Dict[str, Any]
+         Dictionary of parameters for the upload. Expected keys include:
+         - "minio_access_key": Optional[str]
+             Access key for MinIO.
+         - "minio_secret_key": Optional[str]
+             Secret key for MinIO.
+         - "minio_endpoint": str, default _DEFAULT_ENDPOINT
+             MinIO endpoint URL.
+         - "minio_bucket_name": str, default _DEFAULT_BUCKET_NAME
+             Name of the bucket in MinIO.
+         - "minio_bucket_path": str, default "embeddings"
+             Path within the bucket where embeddings are stored.
+         - "minio_session_token": Optional[str]
+             (Optional) Session token for MinIO.
+         - "minio_secure": bool, default False
+             Whether to use a secure connection to MinIO.
+         - "minio_region": Optional[str]
+             (Optional) Region for the MinIO service.
+         - "milvus_address": str, default "milvus"
+             Address of the Milvus service.
+         - "milvus_uri": str, default "http://milvus:19530"
+             URI for Milvus.
+         - "milvus_host": str, default "milvus"
+             Host for Milvus.
+         - "milvus_port": int, default 19530
+             Port for Milvus.
+         - "collection_name": str, default "nv_ingest_collection"
+             Name of the Milvus collection from which to retrieve the schema.
+
+     Returns
+     -------
+     pd.DataFrame
+         The input DataFrame with updated "metadata" columns containing the uploaded embedding URL
+         (or bucket path).
+
+     Raises
+     ------
+     Exception
+         Propagates any exception encountered during the upload process, wrapping it with additional context.
+     """
+     try:
+         # Retrieve connection parameters for MinIO
+         minio_access_key: Optional[str] = task_config.get("minio_access_key")
+         minio_secret_key: Optional[str] = task_config.get("minio_secret_key")
+         minio_endpoint: str = task_config.get("minio_endpoint", _DEFAULT_ENDPOINT)
+         minio_bucket_name: str = task_config.get("minio_bucket_name", _DEFAULT_BUCKET_NAME)
+         minio_bucket_path: str = task_config.get("minio_bucket_path", "embeddings")
+
+         # Retrieve connection parameters for Milvus
+         milvus_address: str = task_config.get("milvus_address", "milvus")
+         milvus_uri: str = task_config.get("milvus_uri", "http://milvus:19530")
+         milvus_host: str = task_config.get("milvus_host", "milvus")
+         milvus_port: int = task_config.get("milvus_port", 19530)
+         milvus_collection_name: str = task_config.get("collection_name", "nv_ingest_collection")
+
+         # Initialize MinIO client
+         client = Minio(
+             minio_endpoint,
+             access_key=minio_access_key,
+             secret_key=minio_secret_key,
+             session_token=task_config.get("minio_session_token"),
+             secure=task_config.get("minio_secure", False),
+             region=task_config.get("minio_region"),
+         )
+
+         # Connect to Milvus and retrieve collection schema
+         connections.connect(
+             address=milvus_address,
+             uri=milvus_uri,  # milvus_uri already carries the port; appending it again would corrupt the URI
+             host=milvus_host,
+             port=milvus_port,
+         )
+         schema = Collection(milvus_collection_name).schema
+
+         # Ensure bucket exists
+         if not client.bucket_exists(minio_bucket_name):
+             client.make_bucket(minio_bucket_name)
+             logger.debug("Created bucket %s", minio_bucket_name)
+         else:
+             logger.debug("Bucket %s already exists", minio_bucket_name)
+
+         # Setup connection parameters for RemoteBulkWriter
+         conn = RemoteBulkWriter.ConnectParam(
+             endpoint=minio_endpoint,
+             access_key=minio_access_key,
+             secret_key=minio_secret_key,
+             bucket_name=minio_bucket_name,
+             secure=False,
+         )
+         writer = RemoteBulkWriter(
+             schema=schema,
+             remote_path=minio_bucket_path,
+             connect_param=conn,
+             file_type=BulkFileType.PARQUET,
+         )
+
+         # Process each row in the DataFrame
+         for idx, row in df_store_ledger.iterrows():
+             metadata: Dict[str, Any] = row["metadata"].copy()
+             # Update embedding metadata with the bucket path
+             metadata["embedding_metadata"] = {"uploaded_embedding_url": minio_bucket_path}
+
+             doc_type = row["document_type"]
+             content_replace: bool = doc_type in [ContentTypeEnum.IMAGE, ContentTypeEnum.STRUCTURED]
+             location: str = metadata["source_metadata"]["source_location"]
+             content = metadata["content"]
+
+             # If an embedding exists, update metadata and append the row for upload
+             if metadata.get("embedding") is not None:
+                 logger.debug(f"row type: {doc_type} - {location} - {len(content)}")
+                 df_store_ledger.at[idx, "metadata"] = metadata
+
+                 writer.append_row(
+                     {
+                         "text": location if content_replace else content,
+                         "source": metadata["source_metadata"],
+                         "content_metadata": metadata["content_metadata"],
+                         "vector": metadata["embedding"],
+                     }
+                 )
+
+         writer.commit()
+         return df_store_ledger
+
+     except Exception as e:
+         err_msg = f"upload_embeddings: Error uploading embeddings. Original error: {e}"
+         logger.error(err_msg, exc_info=True)
+         raise type(e)(err_msg) from e
+
+
+ def store_text_embeddings_internal(
+     df_store_ledger: pd.DataFrame,
+     task_config: Union[BaseModel, Dict[str, Any]],
+     store_config: EmbeddingStorageSchema,
+     execution_trace_log: Optional[Dict[str, Any]] = None,
+ ) -> pd.DataFrame:
+     """
+     Stores embeddings by uploading content from a DataFrame to MinIO.
+
+     This function prepares the necessary parameters for the upload based on the task configuration,
+     invokes the upload routine, and returns the updated DataFrame.
+
+     Parameters
+     ----------
+     df_store_ledger : pd.DataFrame
+         DataFrame containing the data whose embeddings need to be stored.
+     task_config : Union[BaseModel, Dict[str, Any]]
+         Task configuration. If it is a Pydantic model, it will be converted to a dictionary.
+     store_config : EmbeddingStorageSchema
+         Configuration parameters for storage (not directly used in the current implementation).
+     execution_trace_log : Optional[Dict[str, Any]], default=None
+         Optional dictionary for trace logging information.
+
+     Returns
+     -------
+     pd.DataFrame
+         The updated DataFrame after embeddings have been uploaded and metadata updated.
+
+     Raises
+     ------
+     Exception
+         If any error occurs during the storage process, it is logged and re-raised with additional context.
+     """
+
+     _ = store_config  # Unused
+     _ = execution_trace_log  # Unused
+
+     try:
+         # Convert Pydantic model to dict if necessary
+         if isinstance(task_config, BaseModel):
+             task_config = task_config.model_dump()
+
+         # Set content types for embeddings and update params
+         content_types = {ContentTypeEnum.EMBEDDING: True}
+         params: Dict[str, Any] = task_config.get("params", {})
+         params["content_types"] = content_types
+
+         # Perform the upload of embeddings
+         df_store_ledger = _upload_text_embeddings(df_store_ledger, params)
+
+         result, execution_trace_log = df_store_ledger, {}
+         _ = execution_trace_log  # Unused
+
+         return result
+
+     except Exception as e:
+         err_msg = f"_store_embeddings: Failed to store embeddings: {e}"
+         logger.error(err_msg, exc_info=True)
+         raise type(e)(err_msg) from e
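For orientation, here is a hedged sketch of invoking store_text_embeddings_internal, based only on the code above. The credentials, endpoints, and the one-row ledger are invented; it assumes EmbeddingStorageSchema is constructible with defaults (the function ignores store_config) and that a MinIO instance plus a Milvus collection with text/source/content_metadata/vector fields already exist:

# Hypothetical invocation sketch; values below are placeholders, not package
# defaults (except where the docstring above states a default).
import pandas as pd

from nv_ingest_api.internal.schemas.store.store_embedding_schema import EmbeddingStorageSchema
from nv_ingest_api.internal.store.embed_text_upload import store_text_embeddings_internal

df_ledger = pd.DataFrame(
    {
        # A plain string stands in for a ContentTypeEnum value here.
        "document_type": ["text"],
        "metadata": [
            {
                "content": "some chunk text",
                "embedding": [0.1] * 1024,  # dimension must match the Milvus collection schema
                "source_metadata": {"source_location": "s3://bucket/doc.pdf"},
                "content_metadata": {"type": "text"},
            }
        ],
    }
)

task_config = {
    "params": {  # connection settings are read from the nested "params" dict
        "minio_access_key": "minioadmin",
        "minio_secret_key": "minioadmin",
        "minio_endpoint": "minio:9000",
        "minio_bucket_name": "nv-ingest",
        "minio_bucket_path": "embeddings",
        "milvus_uri": "http://milvus:19530",
        "collection_name": "nv_ingest_collection",
    }
}

# store_config is accepted but unused by the current implementation.
df_out = store_text_embeddings_internal(df_ledger, task_config, store_config=EmbeddingStorageSchema())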
nv_ingest_api/internal/store/image_upload.py
@@ -0,0 +1,251 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import base64
+ import logging
+ import os
+ from typing import Any, Dict, List, Optional, Tuple
+
+ import pandas as pd
+ from upath import UPath
+
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
+
+ logger = logging.getLogger(__name__)
+
+
+ def _resolve_storage_root(storage_uri: str, storage_options: Dict[str, Any]) -> Tuple[UPath, str]:
+     """
+     Construct a UPath instance rooted at the configured URI and return both the root path and protocol.
+     """
+     storage_root = UPath(storage_uri, **storage_options)
+     protocol = storage_root._url.scheme or "file"
+     return storage_root, protocol
+
+
+ def _extract_image_type(doc_type: Any, metadata: Dict[str, Any]) -> str:
+     """
+     Determine the image type to use when writing the decoded content based on the document type.
+     """
+
+     def _normalize(raw_value: Any, default: str = "png") -> str:
+         if raw_value is None:
+             return default
+         if hasattr(raw_value, "value"):
+             return str(raw_value.value).lower()
+         string_value = str(raw_value).strip()
+         return string_value.lower() if string_value else default
+
+     if doc_type == ContentTypeEnum.IMAGE:
+         image_metadata = metadata.get("image_metadata", {})
+         return _normalize(image_metadata.get("image_type"))
+
+     if doc_type == ContentTypeEnum.STRUCTURED:
+         table_metadata = metadata.get("table_metadata", {})
+         return _normalize(table_metadata.get("image_type"))
+
+     return "png"
+
+
+ def _build_destination_path(storage_root: UPath, source_id: str, row_index: int, image_type: str) -> Tuple[UPath, str]:
+     """
+     Build the destination UPath for the decoded content and return both the destination and relative key.
+     """
+     safe_source_name = os.path.basename(source_id.rstrip("/")) or "source"
+     clean_source_name = safe_source_name.replace("/", "_")
+
+     destination: UPath = storage_root / clean_source_name / f"{row_index}.{image_type}"
+     destination.parent.mkdir(parents=True, exist_ok=True)
+     relative_key = destination.relative_to(storage_root).as_posix()
+     return destination, relative_key
+
+
+ def _upload_images_via_fsspec(df: pd.DataFrame, params: Dict[str, Any]) -> pd.DataFrame:
+     """
+     Identifies content within a DataFrame and persists it using an fsspec-compatible filesystem, updating
+     metadata with the resulting URIs.
+
+     This function iterates over rows of the provided DataFrame. For rows whose "document_type" is listed
+     in the provided 'content_types' configuration, it decodes the base64-encoded content, writes the object
+     via fsspec/UPath, and updates the metadata with the resolved URL. Errors during individual row processing
+     are logged and skipped so the process continues for remaining rows.
+
+     Parameters
+     ----------
+     df : pd.DataFrame
+         The DataFrame containing rows with content and associated metadata.
+     params : Dict[str, Any]
+         A flat dictionary of configuration parameters for the upload. Expected keys include:
+         - "content_types": Dict mapping document types to booleans.
+         - "storage_uri": Base URI (file://, s3://, etc.) where images should be written.
+         - "storage_options": Optional dictionary forwarded to UPath/fsspec constructors.
+         - "public_base_url": Optional HTTP(s) base used to surface stored objects.
+
+     Returns
+     -------
+     pd.DataFrame
+         The updated DataFrame with metadata reflecting the uploaded URLs. Rows that encountered errors
+         during processing will remain unchanged.
+
+     Raises
+     ------
+     ValueError
+         If the required "content_types" key is missing or is not a dictionary.
+     Exception
+         Propagates any critical exceptions not handled at the row level.
+     """
+     # Validate required configuration
+     content_types = params.get("content_types")
+     if not isinstance(content_types, dict):
+         raise ValueError("Invalid configuration: 'content_types' must be provided as a dictionary in params")
+
+     storage_uri: Optional[str] = params.get("storage_uri")
+     if not storage_uri or not storage_uri.strip():
+         raise ValueError("`storage_uri` must be provided in task params.")
+
+     storage_options: Dict[str, Any] = params.get("storage_options") or {}
+     public_base_url: Optional[str] = params.get("public_base_url")
+
+     storage_root, protocol = _resolve_storage_root(storage_uri, storage_options)
+
+     # Process each row and attempt to upload images
+     for idx, row in df.iterrows():
+         try:
+             doc_type = row.get("document_type")
+             if doc_type not in content_types:
+                 continue
+
+             metadata = row.get("metadata")
+             if not isinstance(metadata, dict):
+                 logger.error("Row %s: 'metadata' is not a dictionary", idx)
+                 continue
+
+             # Validate required metadata fields
+             if "content" not in metadata:
+                 logger.error("Row %s: missing 'content' in metadata", idx)
+                 continue
+
+             if "source_metadata" not in metadata or not isinstance(metadata["source_metadata"], dict):
+                 logger.error("Row %s: missing or invalid 'source_metadata' in metadata", idx)
+                 continue
+
+             source_metadata = metadata["source_metadata"]
+             if "source_id" not in source_metadata:
+                 logger.error("Row %s: missing 'source_id' in source_metadata", idx)
+                 continue
+
+             # Decode the content from base64
+             content = base64.b64decode(metadata["content"].encode())
+             source_id = source_metadata["source_id"]
+
+             image_type = _extract_image_type(doc_type, metadata)
+
+             # Construct destination file path
+             destination, relative_key = _build_destination_path(
+                 storage_root=storage_root,
+                 source_id=source_id,
+                 row_index=idx,
+                 image_type=image_type,
+             )
+             with destination.open("wb") as target_file:
+                 target_file.write(content)
+
+             destination_uri = destination.as_uri()
+             public_url: Optional[str] = None
+             if public_base_url:
+                 public_url = f"{public_base_url.rstrip('/')}/{relative_key}"
+
+             primary_uri = public_url or destination_uri
+             source_metadata["source_location"] = primary_uri
+
+             local_uri: Optional[str] = None
+             if protocol == "file":
+                 local_uri = destination.path
+                 source_metadata["local_source_location"] = local_uri
+
+             if doc_type == ContentTypeEnum.IMAGE:
+                 logger.debug("Persisting image data for row %s", idx)
+                 image_metadata = metadata.get("image_metadata", {})
+                 if public_url is not None:
+                     image_metadata["uploaded_image_url"] = public_url
+                 if local_uri is not None:
+                     image_metadata["uploaded_image_local_path"] = local_uri
+                 metadata["image_metadata"] = image_metadata
+             elif doc_type == ContentTypeEnum.STRUCTURED:
+                 logger.debug("Persisting structured image data for row %s", idx)
+                 table_metadata = metadata.get("table_metadata", {})
+                 if public_url is not None:
+                     table_metadata["uploaded_image_url"] = public_url
+                 if local_uri is not None:
+                     table_metadata["uploaded_image_local_path"] = local_uri
+                 metadata["table_metadata"] = table_metadata
+
+             df.at[idx, "metadata"] = metadata
+
+         except Exception as e:
+             logger.exception("Failed to process row %s: %s", idx, e)
+             # Continue processing the remaining rows
+
+     return df
+
+
+ def store_images_to_minio_internal(
+     df_storage_ledger: pd.DataFrame,
+     task_config: Dict[str, Any],
+     storage_config: Dict[str, Any],
+     execution_trace_log: Optional[List[Any]] = None,
+ ) -> pd.DataFrame:
+     """
+     Processes a storage ledger DataFrame to persist images (and structured content) via an fsspec-compatible
+     filesystem.
+
+     This function validates the input DataFrame and task configuration, then creates a mask to select rows
+     where the "document_type" is among the desired types specified in the configuration. If matching rows are
+     found, it calls the internal upload function to process and update the DataFrame; otherwise, it returns the
+     original DataFrame unmodified.
+
+     Parameters
+     ----------
+     df_storage_ledger : pd.DataFrame
+         The DataFrame containing storage ledger information, which must include at least the columns
+         "document_type" and "metadata".
+     task_config : Dict[str, Any]
+         A flat dictionary containing configuration parameters for image storage. Expected to include the key
+         "content_types" (a dict mapping document types to booleans) along with `storage_uri`,
+         `storage_options`, and optional presentation hints such as `public_base_url`.
+     storage_config : Dict[str, Any]
+         A dictionary reserved for additional storage configuration (currently unused).
+     execution_trace_log : Optional[List[Any]], optional
+         An optional list for capturing execution trace details (currently unused), by default None.
+
+     Returns
+     -------
+     pd.DataFrame
+         The updated DataFrame after attempting to upload images for rows with matching document types. Rows
+         that do not match remain unchanged.
+
+     Raises
+     ------
+     ValueError
+         If the input DataFrame is missing required columns or if the task configuration is invalid.
+     """
+     # Validate that required keys and columns exist
+     if "content_types" not in task_config or not isinstance(task_config["content_types"], dict):
+         raise ValueError("Task configuration must include a valid 'content_types' dictionary.")
+
+     if "document_type" not in df_storage_ledger.columns:
+         raise ValueError("Input DataFrame must contain a 'document_type' column.")
+
+     content_types = task_config["content_types"]
+
+     # Create a mask for rows where "document_type" is one of the desired types
+     storage_obj_mask = df_storage_ledger["document_type"].isin(list(content_types.keys()))
+     if (~storage_obj_mask).all():
+         logger.debug("No storage objects matching %s found in the DataFrame.", content_types)
+         return df_storage_ledger
+
+     result, execution_trace_log = _upload_images_via_fsspec(df_storage_ledger, task_config), {}
+     _ = execution_trace_log
+
+     return result
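And a minimal sketch of driving store_images_to_minio_internal with a local file:// destination, again based only on the code above; the paths and the one-row ledger are invented for illustration:

# Hypothetical invocation sketch; writes one decoded image under /tmp via UPath/fsspec.
import base64

import pandas as pd

from nv_ingest_api.internal.enums.common import ContentTypeEnum
from nv_ingest_api.internal.store.image_upload import store_images_to_minio_internal

png_stub = base64.b64encode(b"\x89PNG\r\n\x1a\n...").decode()  # stand-in for real image bytes

df_ledger = pd.DataFrame(
    {
        "document_type": [ContentTypeEnum.IMAGE],
        "metadata": [
            {
                "content": png_stub,
                "image_metadata": {"image_type": "png"},
                "source_metadata": {"source_id": "report.pdf"},
            }
        ],
    }
)

task_config = {
    "content_types": {ContentTypeEnum.IMAGE: True},
    "storage_uri": "file:///tmp/nv-ingest-images",
    "storage_options": {},
    # "public_base_url": "https://cdn.example.com/images",  # optional, enables uploaded_image_url
}

df_out = store_images_to_minio_internal(df_ledger, task_config, storage_config={})
# For file:// roots, each matched row's source_metadata gains source_location and
# local_source_location, and image_metadata gains uploaded_image_local_path.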
nv_ingest_api/internal/transform/__init__.py
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0