nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
@@ -0,0 +1,702 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ from functools import partial
8
+ from typing import Any, Dict, Tuple, Optional, Iterable, List
9
+ from urllib.parse import urlparse
10
+
11
+ import glom
12
+ import pandas as pd
13
+
14
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
15
+ from nv_ingest_api.internal.schemas.transform.transform_text_embedding_schema import TextEmbeddingSchema
16
+ from nv_ingest_api.util.nim import infer_microservice
17
+
18
+
19
logger = logging.getLogger(__name__)

# Raise the level of the HTTP client loggers to ERROR so that routine
# request/response records from the SDK are not emitted.
logging.getLogger("httpx").setLevel(logging.ERROR)
logging.getLogger("httpcore").setLevel(logging.ERROR)


# Model names this module treats as multi-modal embedding models.
MULTI_MODAL_MODELS = ["llama-3.2-nemoretriever-1b-vlm-embed-v1"]
27
+
28
+
29
+ # ------------------------------------------------------------------------------
30
+ # Asynchronous Embedding Requests
31
+ # ------------------------------------------------------------------------------
32
+
33
+
34
+ def _make_async_request(
35
+ prompts: List[str],
36
+ api_key: str,
37
+ embedding_nim_endpoint: str,
38
+ embedding_model: str,
39
+ encoding_format: str,
40
+ input_type: str,
41
+ truncate: str,
42
+ filter_errors: bool,
43
+ modalities: Optional[List[str]] = None,
44
+ dimensions: Optional[int] = None,
45
+ ) -> list:
46
+ """
47
+ Interacts directly with the NIM embedding service to calculate embeddings for a batch of prompts.
48
+
49
+ Parameters
50
+ ----------
51
+ prompts : List[str]
52
+ A list of prompt strings for which embeddings are to be calculated.
53
+ api_key : str
54
+ API key for authentication with the embedding service.
55
+ embedding_nim_endpoint : str
56
+ Base URL for the NIM embedding service.
57
+ embedding_model : str
58
+ The model to use for generating embeddings.
59
+ encoding_format : str
60
+ The desired encoding format.
61
+ input_type : str
62
+ The type of input data.
63
+ truncate : str
64
+ Truncation setting for the input data.
65
+ filter_errors : bool
66
+ Flag indicating whether to filter errors in the response.
67
+
68
+ Returns
69
+ -------
70
+ list
71
+ A dictionary with keys "embedding" (the embedding results) and "info_msg" (any error info).
72
+
73
+ Raises
74
+ ------
75
+ RuntimeError
76
+ If an error occurs during the embedding request, with an info message attached.
77
+ """
78
+ response = {}
79
+
80
+ try:
81
+ # Normalize API key to avoid sending an empty bearer token via SDK internals
82
+ _token = (api_key or "").strip()
83
+ _api_key = _token if _token else "<no key provided>"
84
+
85
+ resp = infer_microservice(
86
+ prompts,
87
+ embedding_model,
88
+ embedding_endpoint=embedding_nim_endpoint,
89
+ nvidia_api_key=_api_key,
90
+ input_type=input_type,
91
+ truncate=truncate,
92
+ batch_size=8191,
93
+ grpc="http" not in urlparse(embedding_nim_endpoint).scheme,
94
+ input_names=["text"],
95
+ output_names=["embeddings"],
96
+ dtypes=["BYTES"],
97
+ )
98
+
99
+ response["embedding"] = resp
100
+ response["info_msg"] = None
101
+
102
+ except Exception as err:
103
+ # Truncate error message to prevent memory blowup from large text content
104
+ err_str = str(err)
105
+ if len(err_str) > 500:
106
+ truncated_err = err_str[:200] + "... [truncated to prevent memory blowup] ..." + err_str[-100:]
107
+ else:
108
+ truncated_err = err_str
109
+
110
+ raise RuntimeError(f"Embedding error occurred: {truncated_err}") from err
111
+
112
+ return response
113
+
114
+
115
+ def _async_request_handler(
116
+ prompts: List[str],
117
+ api_key: str,
118
+ embedding_nim_endpoint: str,
119
+ embedding_model: str,
120
+ encoding_format: str,
121
+ input_type: str,
122
+ truncate: str,
123
+ filter_errors: bool,
124
+ modalities: Optional[List[str]] = None,
125
+ dimensions: Optional[int] = None,
126
+ ) -> List[dict]:
127
+ """
128
+ Gathers calculated embedding results from the NIM embedding service concurrently.
129
+
130
+ Parameters
131
+ ----------
132
+ prompts : List[str]
133
+ A list of prompt batches.
134
+ api_key : str
135
+ API key for authentication.
136
+ embedding_nim_endpoint : str
137
+ Base URL for the NIM embedding service.
138
+ embedding_model : str
139
+ The model to use for generating embeddings.
140
+ encoding_format : str
141
+ The desired encoding format.
142
+ input_type : str
143
+ The type of input data.
144
+ truncate : str
145
+ Truncation setting for the input data.
146
+ filter_errors : bool
147
+ Flag indicating whether to filter errors in the response.
148
+
149
+ Returns
150
+ -------
151
+ List[dict]
152
+ A list of response dictionaries from the embedding service.
153
+ """
154
+ if modalities is None:
155
+ modalities = [None] * len(prompts)
156
+
157
+ with ThreadPoolExecutor() as executor:
158
+ futures = [
159
+ executor.submit(
160
+ _make_async_request,
161
+ prompts=prompt_batch,
162
+ api_key=api_key,
163
+ embedding_nim_endpoint=embedding_nim_endpoint,
164
+ embedding_model=embedding_model,
165
+ encoding_format=encoding_format,
166
+ input_type=input_type,
167
+ truncate=truncate,
168
+ filter_errors=filter_errors,
169
+ modalities=modality_batch,
170
+ dimensions=dimensions,
171
+ )
172
+ for prompt_batch, modality_batch in zip(prompts, modalities)
173
+ ]
174
+ results = [future.result() for future in futures]
175
+
176
+ return results
177
+
178
+
179
def _async_runner(
    prompts: List[str],
    api_key: str,
    embedding_nim_endpoint: str,
    embedding_model: str,
    encoding_format: str,
    input_type: str,
    truncate: str,
    filter_errors: bool,
    modalities: Optional[List[str]] = None,
    dimensions: Optional[int] = None,
) -> dict:
    """
    Run all batched embedding requests and flatten their results into parallel lists.

    Parameters
    ----------
    prompts : List[str]
        A list of prompt batches.
    api_key : str
        API key for authentication.
    embedding_nim_endpoint : str
        Base URL for the NIM embedding service.
    embedding_model : str
        The model to use for generating embeddings.
    encoding_format : str
        The desired encoding format.
    input_type : str
        The type of input data.
    truncate : str
        Truncation setting for the input data.
    filter_errors : bool
        Flag indicating whether to filter errors in the response.
    modalities : Optional[List[str]], optional
        Modality batches forwarded to the request handler.
    dimensions : Optional[int], optional
        Forwarded to the request handler.

    Returns
    -------
    dict
        A dictionary with keys "embeddings" (one entry per prompt, flattened
        across batches) and "info_msgs" (the batch's info message repeated
        once per prompt of that batch).
    """
    batch_results = _async_request_handler(
        prompts,
        api_key,
        embedding_nim_endpoint,
        embedding_model,
        encoding_format,
        input_type,
        truncate,
        filter_errors,
        modalities=modalities,
        dimensions=dimensions,
    )

    embeddings = []
    info_msgs = []
    for batch in batch_results:
        batch_info = batch["info_msg"]
        for item in batch["embedding"]:
            # Lists and None are stored as-is; any other object is expected to
            # expose the vector on its ``embedding`` attribute.
            if item is None or isinstance(item, list):
                embeddings.append(item)
            else:
                embeddings.append(item.embedding)
            info_msgs.append(batch_info)

    return {"embeddings": embeddings, "info_msgs": info_msgs}
245
+
246
+
247
+ # ------------------------------------------------------------------------------
248
+ # Pandas UDFs for Content Extraction
249
+ # ------------------------------------------------------------------------------
250
+
251
+
252
+ def _add_embeddings(row, embeddings, info_msgs):
253
+ """
254
+ Updates a DataFrame row with embedding data and associated error info.
255
+ Ensures the 'embedding' field is always present, even if None.
256
+
257
+ Parameters
258
+ ----------
259
+ row : pandas.Series
260
+ A row of the DataFrame.
261
+ embeddings : dict
262
+ Dictionary mapping row indices to embeddings.
263
+ info_msgs : dict
264
+ Dictionary mapping row indices to info message dicts.
265
+
266
+ Returns
267
+ -------
268
+ pandas.Series
269
+ The updated row with 'embedding', 'info_message_metadata', and
270
+ '_contains_embeddings' appropriately set.
271
+ """
272
+ embedding = embeddings.get(row.name, None)
273
+ info_msg = info_msgs.get(row.name, None)
274
+
275
+ # Always set embedding, even if None
276
+ row["metadata"]["embedding"] = embedding
277
+
278
+ if info_msg:
279
+ row["metadata"]["info_message_metadata"] = info_msg
280
+ row["document_type"] = ContentTypeEnum.INFO_MSG
281
+ row["_contains_embeddings"] = False
282
+ else:
283
+ row["_contains_embeddings"] = embedding is not None
284
+
285
+ return row
286
+
287
+
288
+ def _add_custom_embeddings(row, embeddings, result_target_field):
289
+ """
290
+ Updates a DataFrame row with embedding data and associated error info
291
+ based on a user supplied custom content field.
292
+
293
+ Parameters
294
+ ----------
295
+ row : pandas.Series
296
+ A row of the DataFrame.
297
+ embeddings : dict
298
+ Dictionary mapping row indices to embeddings.
299
+ result_target_field: str
300
+ The field in custom_content to output the embeddings to
301
+
302
+ Returns
303
+ -------
304
+ pandas.Series
305
+ The updated row
306
+ """
307
+ embedding = embeddings.get(row.name, None)
308
+
309
+ if embedding is not None:
310
+ row["metadata"] = glom.assign(row["metadata"], "custom_content." + result_target_field, embedding, missing=dict)
311
+
312
+ return row
313
+
314
+
315
+ def _format_image_input_string(image_b64: Optional[str]) -> str:
316
+ if not image_b64:
317
+ return
318
+ return f"data:image/png;base64,{image_b64}"
319
+
320
+
321
+ def _format_text_image_pair_input_string(text: Optional[str], image_b64: Optional[str]) -> str:
322
+ if (not text) or (not text.strip()) or (not image_b64):
323
+ return
324
+ return f"{text.strip()} {_format_image_input_string(image_b64)}"
325
+
326
+
327
+ def _get_pandas_text_content(row, modality="text"):
328
+ """
329
+ Extracts text content from a DataFrame row.
330
+
331
+ Parameters
332
+ ----------
333
+ row : pandas.Series
334
+ A row containing the 'content' key.
335
+
336
+ Returns
337
+ -------
338
+ str
339
+ The text content from the row.
340
+ """
341
+ return row["content"]
342
+
343
+
344
+ def _get_pandas_table_content(row, modality="text"):
345
+ """
346
+ Extracts table/chart content from a DataFrame row.
347
+
348
+ Parameters
349
+ ----------
350
+ row : pandas.Series
351
+ A row containing 'table_metadata' with 'table_content'.
352
+
353
+ Returns
354
+ -------
355
+ str
356
+ The table/chart content from the row.
357
+ """
358
+ if modality == "text":
359
+ content = row.get("table_metadata", {}).get("table_content")
360
+ elif modality == "image":
361
+ content = _format_image_input_string(row.get("content"))
362
+ elif modality == "text_image":
363
+ text = row.get("table_metadata", {}).get("table_content")
364
+ image = row.get("content")
365
+ content = _format_text_image_pair_input_string(text, image)
366
+
367
+ return content
368
+
369
+
370
+ def _get_pandas_image_content(row, modality="text"):
371
+ """
372
+ Extracts image caption content from a DataFrame row.
373
+
374
+ Parameters
375
+ ----------
376
+ row : pandas.Series
377
+ A row containing 'image_metadata' with 'caption'.
378
+
379
+ Returns
380
+ -------
381
+ str
382
+ The image caption from the row.
383
+ """
384
+ subtype = row.get("content_metadata", {}).get("subtype")
385
+ if modality == "text":
386
+ if subtype == "page_image":
387
+ content = row.get("image_metadata", {}).get("text")
388
+ else:
389
+ content = row.get("image_metadata", {}).get("caption")
390
+ elif modality == "image":
391
+ content = _format_image_input_string(row.get("content"))
392
+ elif modality == "text_image":
393
+ if subtype == "page_image":
394
+ text = row.get("image_metadata", {}).get("text")
395
+ else:
396
+ text = row.get("image_metadata", {}).get("caption")
397
+ image = row.get("content")
398
+ content = _format_text_image_pair_input_string(text, image)
399
+
400
+ if subtype == "page_image":
401
+ # A workaround to save memory for full page images.
402
+ row["content"] = ""
403
+
404
+ return content
405
+
406
+
407
+ def _get_pandas_audio_content(row, modality="text"):
408
+ """
409
+ A pandas UDF used to select extracted audio transcription to be used to create embeddings.
410
+ """
411
+ return row.get("audio_metadata", {}).get("audio_transcript")
412
+
413
+
414
def _get_pandas_custom_content(row, custom_content_field):
    """
    Look up a field inside the row's 'custom_content' and return it as a string.

    Parameters
    ----------
    row : mapping
        Object supporting ``.get``; the 'custom_content' entry is searched.
    custom_content_field : str
        glom path of the field inside 'custom_content'.

    Returns
    -------
    Optional[str]
        The field converted with ``str``, or None (after logging a warning)
        when the field is missing/None or cannot be converted.
    """
    value = glom.glom(row.get("custom_content", {}), custom_content_field, default=None)
    if value is None:
        logger.warning(f"Custom content field: {custom_content_field} not found")
        return None

    try:
        as_text = str(value)
    except (TypeError, ValueError):
        logger.warning(f"Cannot convert custom content field: {custom_content_field} to string")
        return None
    return as_text
426
+
427
+
428
+ # ------------------------------------------------------------------------------
429
+ # Batch Processing Utilities
430
+ # ------------------------------------------------------------------------------
431
+
432
+
433
+ def _batch_generator(iterable: Iterable, batch_size: int = 10):
434
+ """
435
+ Yields batches of a specified size from an iterable.
436
+
437
+ Parameters
438
+ ----------
439
+ iterable : Iterable
440
+ The iterable to batch.
441
+ batch_size : int, optional
442
+ The size of each batch (default is 10).
443
+
444
+ Yields
445
+ ------
446
+ list
447
+ A batch of items from the iterable.
448
+ """
449
+ iter_len = len(iterable)
450
+ for idx in range(0, iter_len, batch_size):
451
+ yield iterable[idx : min(idx + batch_size, iter_len)]
452
+
453
+
454
def _generate_batches(prompts: List[str], batch_size: int = 100) -> List[List[str]]:
    """
    Splits a list of prompts into batches.

    Parameters
    ----------
    prompts : List[str]
        The list of prompt strings.
    batch_size : int, optional
        The desired batch size (default is 100).

    Returns
    -------
    List[List[str]]
        A list of batches, each containing a subset of the prompts in their
        original order. An empty input yields an empty list.
    """
    return list(_batch_generator(prompts, batch_size))
471
+
472
+
473
+ # ------------------------------------------------------------------------------
474
+ # DataFrame Concatenation Utility
475
+ # ------------------------------------------------------------------------------
476
+
477
+
478
+ def _concatenate_extractions_pandas(
479
+ base_df: pd.DataFrame, dataframes: List[pd.DataFrame], masks: List[pd.Series]
480
+ ) -> pd.DataFrame:
481
+ """
482
+ Concatenates processed DataFrame rows (with embeddings) with unprocessed rows from the base DataFrame.
483
+
484
+ Parameters
485
+ ----------
486
+ base_df : pd.DataFrame
487
+ The original DataFrame.
488
+ dataframes : List[pd.DataFrame]
489
+ List of DataFrames that have been enriched with embeddings.
490
+ masks : List[pd.Series]
491
+ List of boolean masks indicating the rows that were processed.
492
+
493
+ Returns
494
+ -------
495
+ pd.DataFrame
496
+ The concatenated DataFrame with embeddings applied where available.
497
+ """
498
+ unified_mask = pd.Series(False, index=base_df.index)
499
+ for mask in masks:
500
+ unified_mask = unified_mask | mask
501
+
502
+ df_no_text = base_df.loc[~unified_mask].copy()
503
+ df_no_text["_contains_embeddings"] = False
504
+
505
+ dataframes.append(df_no_text)
506
+ combined_df = pd.concat(dataframes, axis=0, ignore_index=True).reset_index(drop=True)
507
+ return combined_df
508
+
509
+
510
+ # ------------------------------------------------------------------------------
511
+ # Embedding Extraction Pipeline
512
+ # ------------------------------------------------------------------------------
513
+
514
+
515
def does_model_support_multimodal_embeddings(model: str) -> bool:
    """
    Tell whether a model is listed as a multi-modal embedding model.

    Parameters
    ----------
    model : str
        The name of the model.

    Returns
    -------
    bool
        True if ``model`` is contained in ``MULTI_MODAL_MODELS``, False otherwise.
    """
    is_multimodal = model in MULTI_MODAL_MODELS
    return is_multimodal
530
+
531
+
532
def transform_create_text_embeddings_internal(
    df_transform_ledger: pd.DataFrame,
    task_config: Dict[str, Any],
    transform_config: TextEmbeddingSchema = TextEmbeddingSchema(),
    execution_trace_log: Optional[Dict] = None,
) -> Tuple[pd.DataFrame, Dict]:
    """
    Generates text embeddings for supported content types (TEXT, STRUCTURED, IMAGE, AUDIO)
    from a pandas DataFrame using asynchronous requests.

    This function ensures that even if the extracted content is empty or None,
    the embedding field is explicitly created and set to None. VIDEO rows are
    not embedded yet; they are handled like rows without content.

    Values present (and truthy) in ``task_config`` take precedence over the
    corresponding ``transform_config`` attributes.

    Parameters
    ----------
    df_transform_ledger : pd.DataFrame
        The DataFrame containing content for embedding extraction. Its
        'metadata' column is read, including ``metadata["content_metadata"]["type"]``.
    task_config : Dict[str, Any]
        Task-level overrides. Keys read here: 'api_key', 'endpoint_url',
        'model_name', 'custom_content_field', 'dimensions',
        'result_target_field' and the '<type>_elements_modality' keys.
    transform_config : TextEmbeddingSchema, optional
        Validated configuration for text embedding extraction.
    execution_trace_log : Optional[Dict], optional
        Optional trace information for debugging or logging (default is None).

    Returns
    -------
    Tuple[pd.DataFrame, Dict]
        A tuple containing:
            - The updated DataFrame with embeddings applied. Rows are
              regrouped by content type and re-indexed from 0.
            - A dictionary with trace information under 'trace_info'.
    """
    api_key = task_config.get("api_key") or transform_config.api_key
    endpoint_url = task_config.get("endpoint_url") or transform_config.embedding_nim_endpoint
    model_name = task_config.get("model_name") or transform_config.embedding_model
    custom_content_field = task_config.get("custom_content_field") or transform_config.custom_content_field
    dimensions = task_config.get("dimensions") or transform_config.dimensions

    if execution_trace_log is None:
        execution_trace_log = {}
        logger.debug("No trace_info provided. Initialized empty trace_info dictionary.")

    if df_transform_ledger.empty:
        return df_transform_ledger, {"trace_info": execution_trace_log}

    def _unsupported_content(row, modality=None):
        # Getter for content types without embedding support. It must accept
        # ``modality`` because every getter is invoked with that keyword.
        return None

    def _normalize_content(value):
        # Non-string, empty and whitespace-only values become None.
        if isinstance(value, str):
            stripped = value.strip()
            if stripped:
                return stripped
        return None

    def _content_type_getter(row):
        return row["content_metadata"]["type"]

    embedding_dataframes = []
    content_masks = []

    pandas_content_extractor = {
        ContentTypeEnum.TEXT: _get_pandas_text_content,
        ContentTypeEnum.STRUCTURED: _get_pandas_table_content,
        ContentTypeEnum.IMAGE: _get_pandas_image_content,
        ContentTypeEnum.AUDIO: _get_pandas_audio_content,
        ContentTypeEnum.VIDEO: _unsupported_content,  # Not supported yet.
    }
    task_type_to_modality = {
        ContentTypeEnum.TEXT: task_config.get("text_elements_modality") or transform_config.text_elements_modality,
        ContentTypeEnum.STRUCTURED: (
            task_config.get("structured_elements_modality") or transform_config.structured_elements_modality
        ),
        ContentTypeEnum.IMAGE: task_config.get("image_elements_modality") or transform_config.image_elements_modality,
        ContentTypeEnum.AUDIO: task_config.get("audio_elements_modality") or transform_config.audio_elements_modality,
        ContentTypeEnum.VIDEO: None,  # Not supported yet.
    }

    # The content type of each row does not change inside the loop, so resolve it once.
    row_content_types = df_transform_ledger["metadata"].apply(_content_type_getter)

    for content_type, content_getter in pandas_content_extractor.items():
        if not content_getter:
            logger.warning(f"Skipping text_embedding generation for unsupported content type: {content_type}")
            continue

        # Get rows matching the content type
        content_mask = row_content_types == content_type.value
        if not content_mask.any():
            continue

        modality = task_type_to_modality[content_type]

        # Always include all content_mask rows and prepare them
        df_content = df_transform_ledger.loc[content_mask].copy().reset_index(drop=True)

        # Extract content and normalize empty or non-str to None
        extracted_content = (
            df_content["metadata"].apply(partial(content_getter, modality=modality)).apply(_normalize_content)
        )
        df_content["_content"] = extracted_content

        # Prepare batches for only valid (non-None) content
        valid_content_mask = df_content["_content"].notna()
        if valid_content_mask.any():
            filtered_content_list = df_content.loc[valid_content_mask, "_content"].tolist()
            filtered_content_batches = _generate_batches(filtered_content_list, batch_size=transform_config.batch_size)

            if model_name in MULTI_MODAL_MODELS:
                modality_list = [modality] * len(filtered_content_list)
                modality_batches = _generate_batches(modality_list, batch_size=transform_config.batch_size)
            else:
                modality_batches = None

            content_embeddings = _async_runner(
                filtered_content_batches,
                api_key,
                endpoint_url,
                model_name,
                transform_config.encoding_format,
                transform_config.input_type,
                transform_config.truncate,
                False,
                modalities=modality_batches,
                dimensions=dimensions,
            )
            # Build a simple row index -> embedding map
            valid_index = df_content.loc[valid_content_mask].index
            embeddings_dict = dict(zip(valid_index, content_embeddings.get("embeddings", [])))
            info_msgs_dict = dict(zip(valid_index, content_embeddings.get("info_msgs", [])))
        else:
            embeddings_dict = {}
            info_msgs_dict = {}

        # Apply embeddings or None to all rows
        df_content = df_content.apply(_add_embeddings, embeddings=embeddings_dict, info_msgs=info_msgs_dict, axis=1)

        embedding_dataframes.append(df_content)
        content_masks.append(content_mask)

    combined_df = _concatenate_extractions_pandas(df_transform_ledger, embedding_dataframes, content_masks)

    # Embed custom content
    if custom_content_field is not None:
        result_target_field = task_config.get("result_target_field") or custom_content_field + "_embedding"

        extracted_custom_content = (
            combined_df["metadata"]
            .apply(partial(_get_pandas_custom_content, custom_content_field=custom_content_field))
            .apply(_normalize_content)
        )

        valid_custom_content_mask = extracted_custom_content.notna()
        if valid_custom_content_mask.any():
            custom_content_list = extracted_custom_content[valid_custom_content_mask].to_list()
            custom_content_batches = _generate_batches(custom_content_list, batch_size=transform_config.batch_size)

            custom_content_embeddings = _async_runner(
                custom_content_batches,
                api_key,
                endpoint_url,
                model_name,
                transform_config.encoding_format,
                transform_config.input_type,
                transform_config.truncate,
                False,
                dimensions=dimensions,
            )
            custom_embeddings_dict = dict(
                zip(
                    extracted_custom_content.loc[valid_custom_content_mask].index,
                    custom_content_embeddings.get("embeddings", []),
                )
            )
        else:
            custom_embeddings_dict = {}

        combined_df = combined_df.apply(
            _add_custom_embeddings, embeddings=custom_embeddings_dict, result_target_field=result_target_field, axis=1
        )

    return combined_df, {"trace_info": execution_trace_log}