nv-ingest-api 26.1.0rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +218 -0
- nv_ingest_api/interface/extract.py +977 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +200 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +186 -0
- nv_ingest_api/internal/__init__.py +0 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +550 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
- nv_ingest_api/internal/extract/html/__init__.py +3 -0
- nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
- nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
- nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
- nv_ingest_api/internal/meta/__init__.py +3 -0
- nv_ingest_api/internal/meta/udf.py +232 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/internal/primitives/control_message_task.py +16 -0
- nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
- nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
- nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
- nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
- nv_ingest_api/internal/schemas/meta/udf.py +23 -0
- nv_ingest_api/internal/schemas/mixins.py +39 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +251 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +219 -0
- nv_ingest_api/internal/transform/embed_text.py +702 -0
- nv_ingest_api/internal/transform/split_text.py +182 -0
- nv_ingest_api/util/__init__.py +3 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/dataloader/__init__.py +9 -0
- nv_ingest_api/util/dataloader/dataloader.py +409 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +429 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +177 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
- nv_ingest_api/util/image_processing/transforms.py +850 -0
- nv_ingest_api/util/imports/__init__.py +3 -0
- nv_ingest_api/util/imports/callable_signatures.py +108 -0
- nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
- nv_ingest_api/util/introspection/__init__.py +3 -0
- nv_ingest_api/util/introspection/class_inspect.py +145 -0
- nv_ingest_api/util/introspection/function_inspect.py +65 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +102 -0
- nv_ingest_api/util/logging/sanitize.py +84 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +516 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
- nv_ingest_api/util/nim/__init__.py +161 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +428 -0
- nv_ingest_api/util/schema/__init__.py +3 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +86 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- nv_ingest_api/util/string_processing/configuration.py +682 -0
- nv_ingest_api/util/string_processing/yaml.py +109 -0
- nv_ingest_api/util/system/__init__.py +0 -0
- nv_ingest_api/util/system/hardware_info.py +594 -0
- nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
- nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
- nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
- nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
- nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
- udfs/__init__.py +5 -0
- udfs/llm_summarizer_udf.py +259 -0
|
@@ -0,0 +1,702 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
7
|
+
from functools import partial
|
|
8
|
+
from typing import Any, Dict, Tuple, Optional, Iterable, List
|
|
9
|
+
from urllib.parse import urlparse
|
|
10
|
+
|
|
11
|
+
import glom
|
|
12
|
+
import pandas as pd
|
|
13
|
+
|
|
14
|
+
from nv_ingest_api.internal.enums.common import ContentTypeEnum
|
|
15
|
+
from nv_ingest_api.internal.schemas.transform.transform_text_embedding_schema import TextEmbeddingSchema
|
|
16
|
+
from nv_ingest_api.util.nim import infer_microservice
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)

# Keep the HTTP client libraries quiet: only errors from these loggers are emitted,
# so per-request/response log lines do not show up in the output.
for _noisy_logger_name in ("httpx", "httpcore"):
    logging.getLogger(_noisy_logger_name).setLevel(logging.ERROR)
del _noisy_logger_name  # do not leave the loop variable in the module namespace

# Model names for which multi-modal embeddings are considered supported
# (consulted by does_model_support_multimodal_embeddings).
MULTI_MODAL_MODELS = ["llama-3.2-nemoretriever-1b-vlm-embed-v1"]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# ------------------------------------------------------------------------------
|
|
30
|
+
# Asynchronous Embedding Requests
|
|
31
|
+
# ------------------------------------------------------------------------------
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _make_async_request(
    prompts: List[str],
    api_key: str,
    embedding_nim_endpoint: str,
    embedding_model: str,
    encoding_format: str,
    input_type: str,
    truncate: str,
    filter_errors: bool,
    modalities: Optional[List[str]] = None,
    dimensions: Optional[int] = None,
) -> dict:
    """
    Requests embeddings for one batch of prompts from the NIM embedding service.

    Parameters
    ----------
    prompts : List[str]
        A list of prompt strings for which embeddings are to be calculated.
    api_key : str
        API key for authentication with the embedding service. Empty or
        whitespace-only keys are replaced by a placeholder string.
    embedding_nim_endpoint : str
        Base URL for the NIM embedding service. gRPC is selected when the URL
        scheme does not contain "http".
    embedding_model : str
        The model to use for generating embeddings.
    encoding_format : str
        The desired encoding format. Accepted for interface compatibility; not
        forwarded to the service call by this function.
    input_type : str
        The type of input data.
    truncate : str
        Truncation setting for the input data.
    filter_errors : bool
        Flag indicating whether to filter errors in the response. Accepted for
        interface compatibility; not used by this function.
    modalities : Optional[List[str]], optional
        Per-prompt modalities. Accepted for interface compatibility; not
        forwarded to the service call by this function.
    dimensions : Optional[int], optional
        Requested embedding dimensionality. Accepted for interface
        compatibility; not forwarded to the service call by this function.

    Returns
    -------
    dict
        A dictionary with keys "embedding" (the value returned by
        ``infer_microservice``) and "info_msg" (always None on success).

    Raises
    ------
    RuntimeError
        If any error occurs while preparing or performing the request. The
        original exception is chained as the cause and its message is shortened
        when it exceeds 500 characters.
    """
    try:
        # Normalize API key to avoid sending an empty bearer token via SDK internals
        token = (api_key or "").strip()
        effective_api_key = token if token else "<no key provided>"

        embeddings = infer_microservice(
            prompts,
            embedding_model,
            embedding_endpoint=embedding_nim_endpoint,
            nvidia_api_key=effective_api_key,
            input_type=input_type,
            truncate=truncate,
            batch_size=8191,
            grpc="http" not in urlparse(embedding_nim_endpoint).scheme,
            input_names=["text"],
            output_names=["embeddings"],
            dtypes=["BYTES"],
        )
    except Exception as err:
        # Truncate error message to prevent memory blowup from large text content
        err_str = str(err)
        if len(err_str) > 500:
            truncated_err = err_str[:200] + "... [truncated to prevent memory blowup] ..." + err_str[-100:]
        else:
            truncated_err = err_str

        raise RuntimeError(f"Embedding error occurred: {truncated_err}") from err

    return {"embedding": embeddings, "info_msg": None}
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _async_request_handler(
    prompts: List[str],
    api_key: str,
    embedding_nim_endpoint: str,
    embedding_model: str,
    encoding_format: str,
    input_type: str,
    truncate: str,
    filter_errors: bool,
    modalities: Optional[List[str]] = None,
    dimensions: Optional[int] = None,
) -> List[dict]:
    """
    Submits one embedding request per prompt batch to a thread pool and collects the results.

    Parameters
    ----------
    prompts : List[str]
        A list of prompt batches; each element is sent as one request.
    api_key : str
        API key for authentication.
    embedding_nim_endpoint : str
        Base URL for the NIM embedding service.
    embedding_model : str
        The model to use for generating embeddings.
    encoding_format : str
        The desired encoding format.
    input_type : str
        The type of input data.
    truncate : str
        Truncation setting for the input data.
    filter_errors : bool
        Flag indicating whether to filter errors in the response.
    modalities : Optional[List[str]], optional
        Modality batches paired element-wise with ``prompts``. When None, every
        prompt batch is paired with None.
    dimensions : Optional[int], optional
        Passed through unchanged to every request.

    Returns
    -------
    List[dict]
        The response dictionaries, in the same order as the submitted batches.
        An exception raised by any request propagates to the caller.
    """
    modality_batches = [None] * len(prompts) if modalities is None else modalities

    # Bind everything that is identical across batches once; only the prompt
    # batch and its modality batch vary per submission.
    request_for_batch = partial(
        _make_async_request,
        api_key=api_key,
        embedding_nim_endpoint=embedding_nim_endpoint,
        embedding_model=embedding_model,
        encoding_format=encoding_format,
        input_type=input_type,
        truncate=truncate,
        filter_errors=filter_errors,
        dimensions=dimensions,
    )

    with ThreadPoolExecutor() as pool:
        pending = []
        for prompt_batch, modality_batch in zip(prompts, modality_batches):
            pending.append(pool.submit(request_for_batch, prompts=prompt_batch, modalities=modality_batch))
        responses = [task.result() for task in pending]

    return responses
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _async_runner(
    prompts: List[str],
    api_key: str,
    embedding_nim_endpoint: str,
    embedding_model: str,
    encoding_format: str,
    input_type: str,
    truncate: str,
    filter_errors: bool,
    modalities: Optional[List[str]] = None,
    dimensions: Optional[int] = None,
) -> dict:
    """
    Runs all embedding requests and flattens the per-batch responses into two parallel lists.

    Parameters
    ----------
    prompts : List[str]
        A list of prompt batches.
    api_key : str
        API key for authentication.
    embedding_nim_endpoint : str
        Base URL for the NIM embedding service.
    embedding_model : str
        The model to use for generating embeddings.
    encoding_format : str
        The desired encoding format.
    input_type : str
        The type of input data.
    truncate : str
        Truncation setting for the input data.
    filter_errors : bool
        Flag indicating whether to filter errors in the response.
    modalities : Optional[List[str]], optional
        Modality batches forwarded to the request handler.
    dimensions : Optional[int], optional
        Forwarded to the request handler.

    Returns
    -------
    dict
        A dictionary with keys "embeddings" (one entry per returned embedding,
        across all batches) and "info_msgs" (the batch's info message repeated
        once per embedding of that batch).
    """
    batch_responses = _async_request_handler(
        prompts,
        api_key,
        embedding_nim_endpoint,
        embedding_model,
        encoding_format,
        input_type,
        truncate,
        filter_errors,
        modalities=modalities,
        dimensions=dimensions,
    )

    all_embeddings = []
    all_info_msgs = []
    for response in batch_responses:
        batch_info_msg = response["info_msg"]
        for item in response["embedding"]:
            # Lists and None are stored as-is; any other object is expected to
            # expose the vector through its ``embedding`` attribute.
            keep_as_is = item is None or isinstance(item, list)
            all_embeddings.append(item if keep_as_is else item.embedding)
            all_info_msgs.append(batch_info_msg)

    return {"embeddings": all_embeddings, "info_msgs": all_info_msgs}
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
# ------------------------------------------------------------------------------
|
|
248
|
+
# Pandas UDFs for Content Extraction
|
|
249
|
+
# ------------------------------------------------------------------------------
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def _add_embeddings(row, embeddings, info_msgs):
|
|
253
|
+
"""
|
|
254
|
+
Updates a DataFrame row with embedding data and associated error info.
|
|
255
|
+
Ensures the 'embedding' field is always present, even if None.
|
|
256
|
+
|
|
257
|
+
Parameters
|
|
258
|
+
----------
|
|
259
|
+
row : pandas.Series
|
|
260
|
+
A row of the DataFrame.
|
|
261
|
+
embeddings : dict
|
|
262
|
+
Dictionary mapping row indices to embeddings.
|
|
263
|
+
info_msgs : dict
|
|
264
|
+
Dictionary mapping row indices to info message dicts.
|
|
265
|
+
|
|
266
|
+
Returns
|
|
267
|
+
-------
|
|
268
|
+
pandas.Series
|
|
269
|
+
The updated row with 'embedding', 'info_message_metadata', and
|
|
270
|
+
'_contains_embeddings' appropriately set.
|
|
271
|
+
"""
|
|
272
|
+
embedding = embeddings.get(row.name, None)
|
|
273
|
+
info_msg = info_msgs.get(row.name, None)
|
|
274
|
+
|
|
275
|
+
# Always set embedding, even if None
|
|
276
|
+
row["metadata"]["embedding"] = embedding
|
|
277
|
+
|
|
278
|
+
if info_msg:
|
|
279
|
+
row["metadata"]["info_message_metadata"] = info_msg
|
|
280
|
+
row["document_type"] = ContentTypeEnum.INFO_MSG
|
|
281
|
+
row["_contains_embeddings"] = False
|
|
282
|
+
else:
|
|
283
|
+
row["_contains_embeddings"] = embedding is not None
|
|
284
|
+
|
|
285
|
+
return row
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def _add_custom_embeddings(row, embeddings, result_target_field):
|
|
289
|
+
"""
|
|
290
|
+
Updates a DataFrame row with embedding data and associated error info
|
|
291
|
+
based on a user supplied custom content field.
|
|
292
|
+
|
|
293
|
+
Parameters
|
|
294
|
+
----------
|
|
295
|
+
row : pandas.Series
|
|
296
|
+
A row of the DataFrame.
|
|
297
|
+
embeddings : dict
|
|
298
|
+
Dictionary mapping row indices to embeddings.
|
|
299
|
+
result_target_field: str
|
|
300
|
+
The field in custom_content to output the embeddings to
|
|
301
|
+
|
|
302
|
+
Returns
|
|
303
|
+
-------
|
|
304
|
+
pandas.Series
|
|
305
|
+
The updated row
|
|
306
|
+
"""
|
|
307
|
+
embedding = embeddings.get(row.name, None)
|
|
308
|
+
|
|
309
|
+
if embedding is not None:
|
|
310
|
+
row["metadata"] = glom.assign(row["metadata"], "custom_content." + result_target_field, embedding, missing=dict)
|
|
311
|
+
|
|
312
|
+
return row
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def _format_image_input_string(image_b64: Optional[str]) -> str:
|
|
316
|
+
if not image_b64:
|
|
317
|
+
return
|
|
318
|
+
return f"data:image/png;base64,{image_b64}"
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def _format_text_image_pair_input_string(text: Optional[str], image_b64: Optional[str]) -> str:
|
|
322
|
+
if (not text) or (not text.strip()) or (not image_b64):
|
|
323
|
+
return
|
|
324
|
+
return f"{text.strip()} {_format_image_input_string(image_b64)}"
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def _get_pandas_text_content(row, modality="text"):
|
|
328
|
+
"""
|
|
329
|
+
Extracts text content from a DataFrame row.
|
|
330
|
+
|
|
331
|
+
Parameters
|
|
332
|
+
----------
|
|
333
|
+
row : pandas.Series
|
|
334
|
+
A row containing the 'content' key.
|
|
335
|
+
|
|
336
|
+
Returns
|
|
337
|
+
-------
|
|
338
|
+
str
|
|
339
|
+
The text content from the row.
|
|
340
|
+
"""
|
|
341
|
+
return row["content"]
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def _get_pandas_table_content(row, modality="text"):
|
|
345
|
+
"""
|
|
346
|
+
Extracts table/chart content from a DataFrame row.
|
|
347
|
+
|
|
348
|
+
Parameters
|
|
349
|
+
----------
|
|
350
|
+
row : pandas.Series
|
|
351
|
+
A row containing 'table_metadata' with 'table_content'.
|
|
352
|
+
|
|
353
|
+
Returns
|
|
354
|
+
-------
|
|
355
|
+
str
|
|
356
|
+
The table/chart content from the row.
|
|
357
|
+
"""
|
|
358
|
+
if modality == "text":
|
|
359
|
+
content = row.get("table_metadata", {}).get("table_content")
|
|
360
|
+
elif modality == "image":
|
|
361
|
+
content = _format_image_input_string(row.get("content"))
|
|
362
|
+
elif modality == "text_image":
|
|
363
|
+
text = row.get("table_metadata", {}).get("table_content")
|
|
364
|
+
image = row.get("content")
|
|
365
|
+
content = _format_text_image_pair_input_string(text, image)
|
|
366
|
+
|
|
367
|
+
return content
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
def _get_pandas_image_content(row, modality="text"):
|
|
371
|
+
"""
|
|
372
|
+
Extracts image caption content from a DataFrame row.
|
|
373
|
+
|
|
374
|
+
Parameters
|
|
375
|
+
----------
|
|
376
|
+
row : pandas.Series
|
|
377
|
+
A row containing 'image_metadata' with 'caption'.
|
|
378
|
+
|
|
379
|
+
Returns
|
|
380
|
+
-------
|
|
381
|
+
str
|
|
382
|
+
The image caption from the row.
|
|
383
|
+
"""
|
|
384
|
+
subtype = row.get("content_metadata", {}).get("subtype")
|
|
385
|
+
if modality == "text":
|
|
386
|
+
if subtype == "page_image":
|
|
387
|
+
content = row.get("image_metadata", {}).get("text")
|
|
388
|
+
else:
|
|
389
|
+
content = row.get("image_metadata", {}).get("caption")
|
|
390
|
+
elif modality == "image":
|
|
391
|
+
content = _format_image_input_string(row.get("content"))
|
|
392
|
+
elif modality == "text_image":
|
|
393
|
+
if subtype == "page_image":
|
|
394
|
+
text = row.get("image_metadata", {}).get("text")
|
|
395
|
+
else:
|
|
396
|
+
text = row.get("image_metadata", {}).get("caption")
|
|
397
|
+
image = row.get("content")
|
|
398
|
+
content = _format_text_image_pair_input_string(text, image)
|
|
399
|
+
|
|
400
|
+
if subtype == "page_image":
|
|
401
|
+
# A workaround to save memory for full page images.
|
|
402
|
+
row["content"] = ""
|
|
403
|
+
|
|
404
|
+
return content
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def _get_pandas_audio_content(row, modality="text"):
|
|
408
|
+
"""
|
|
409
|
+
A pandas UDF used to select extracted audio transcription to be used to create embeddings.
|
|
410
|
+
"""
|
|
411
|
+
return row.get("audio_metadata", {}).get("audio_transcript")
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
def _get_pandas_custom_content(row, custom_content_field):
    """
    Reads a user supplied field from a row's 'custom_content' and returns it as a string.

    Parameters
    ----------
    row : pandas.Series
        A row that may contain a 'custom_content' mapping.
    custom_content_field : str
        glom path of the field to read inside 'custom_content'.

    Returns
    -------
    str
        ``str()`` of the field value, or None (after logging a warning) when
        the field is not found, is None, or cannot be converted to a string.
    """
    content = glom.glom(row.get("custom_content", {}), custom_content_field, default=None)
    if content is None:
        logger.warning(f"Custom content field: {custom_content_field} not found")
        return None

    try:
        content_text = str(content)
    except (TypeError, ValueError):
        logger.warning(f"Cannot convert custom content field: {custom_content_field} to string")
        return None

    return content_text
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
# ------------------------------------------------------------------------------
|
|
429
|
+
# Batch Processing Utilities
|
|
430
|
+
# ------------------------------------------------------------------------------
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def _batch_generator(iterable: Iterable, batch_size: int = 10):
|
|
434
|
+
"""
|
|
435
|
+
Yields batches of a specified size from an iterable.
|
|
436
|
+
|
|
437
|
+
Parameters
|
|
438
|
+
----------
|
|
439
|
+
iterable : Iterable
|
|
440
|
+
The iterable to batch.
|
|
441
|
+
batch_size : int, optional
|
|
442
|
+
The size of each batch (default is 10).
|
|
443
|
+
|
|
444
|
+
Yields
|
|
445
|
+
------
|
|
446
|
+
list
|
|
447
|
+
A batch of items from the iterable.
|
|
448
|
+
"""
|
|
449
|
+
iter_len = len(iterable)
|
|
450
|
+
for idx in range(0, iter_len, batch_size):
|
|
451
|
+
yield iterable[idx : min(idx + batch_size, iter_len)]
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
def _generate_batches(prompts: List[str], batch_size: int = 100) -> List[List[str]]:
    """
    Splits a list of prompts into batches.

    Parameters
    ----------
    prompts : List[str]
        The list of prompt strings.
    batch_size : int, optional
        The desired batch size (default is 100).

    Returns
    -------
    List[List[str]]
        A list of batches, each containing a consecutive subset of the prompts.
    """
    return list(_batch_generator(prompts, batch_size))
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
# ------------------------------------------------------------------------------
|
|
474
|
+
# DataFrame Concatenation Utility
|
|
475
|
+
# ------------------------------------------------------------------------------
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
def _concatenate_extractions_pandas(
|
|
479
|
+
base_df: pd.DataFrame, dataframes: List[pd.DataFrame], masks: List[pd.Series]
|
|
480
|
+
) -> pd.DataFrame:
|
|
481
|
+
"""
|
|
482
|
+
Concatenates processed DataFrame rows (with embeddings) with unprocessed rows from the base DataFrame.
|
|
483
|
+
|
|
484
|
+
Parameters
|
|
485
|
+
----------
|
|
486
|
+
base_df : pd.DataFrame
|
|
487
|
+
The original DataFrame.
|
|
488
|
+
dataframes : List[pd.DataFrame]
|
|
489
|
+
List of DataFrames that have been enriched with embeddings.
|
|
490
|
+
masks : List[pd.Series]
|
|
491
|
+
List of boolean masks indicating the rows that were processed.
|
|
492
|
+
|
|
493
|
+
Returns
|
|
494
|
+
-------
|
|
495
|
+
pd.DataFrame
|
|
496
|
+
The concatenated DataFrame with embeddings applied where available.
|
|
497
|
+
"""
|
|
498
|
+
unified_mask = pd.Series(False, index=base_df.index)
|
|
499
|
+
for mask in masks:
|
|
500
|
+
unified_mask = unified_mask | mask
|
|
501
|
+
|
|
502
|
+
df_no_text = base_df.loc[~unified_mask].copy()
|
|
503
|
+
df_no_text["_contains_embeddings"] = False
|
|
504
|
+
|
|
505
|
+
dataframes.append(df_no_text)
|
|
506
|
+
combined_df = pd.concat(dataframes, axis=0, ignore_index=True).reset_index(drop=True)
|
|
507
|
+
return combined_df
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
# ------------------------------------------------------------------------------
|
|
511
|
+
# Embedding Extraction Pipeline
|
|
512
|
+
# ------------------------------------------------------------------------------
|
|
513
|
+
|
|
514
|
+
|
|
515
|
+
def does_model_support_multimodal_embeddings(model: str) -> bool:
    """
    Tells whether a model is listed in ``MULTI_MODAL_MODELS``.

    Parameters
    ----------
    model : str
        The name of the model; compared by exact match.

    Returns
    -------
    bool
        True if the model supports multi-modal embeddings, False otherwise.
    """
    is_multimodal = model in MULTI_MODAL_MODELS
    return is_multimodal
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
def transform_create_text_embeddings_internal(
    df_transform_ledger: pd.DataFrame,
    task_config: Dict[str, Any],
    transform_config: TextEmbeddingSchema = TextEmbeddingSchema(),
    execution_trace_log: Optional[Dict] = None,
) -> Tuple[pd.DataFrame, Dict]:
    """
    Generates text embeddings for supported content types (TEXT, STRUCTURED, IMAGE, AUDIO)
    from a pandas DataFrame, dispatching the embedding requests through ``_async_runner``.

    This function ensures that even if the extracted content is empty or None,
    the embedding field is explicitly created and set to None. VIDEO rows are not
    supported yet: their content is always treated as None, so they never trigger
    an embedding request.

    Settings are resolved per call: a truthy value in ``task_config`` takes precedence
    over the corresponding attribute of ``transform_config``.

    Parameters
    ----------
    df_transform_ledger : pd.DataFrame
        The DataFrame containing content for embedding extraction. Each row's
        ``metadata`` entry must provide ``["content_metadata"]["type"]``.
    task_config : Dict[str, Any]
        Dictionary containing task properties. Keys read here: ``api_key``,
        ``endpoint_url``, ``model_name``, ``custom_content_field``, ``dimensions``,
        ``text_elements_modality``, ``structured_elements_modality``,
        ``image_elements_modality``, ``audio_elements_modality`` and
        ``result_target_field``.
    transform_config : TextEmbeddingSchema, optional
        Validated configuration for text embedding extraction; supplies the fallback
        for every setting missing from ``task_config``, plus batch size, encoding
        format, input type and truncation mode.
    execution_trace_log : Optional[Dict], optional
        Optional trace information for debugging or logging (default is None).

    Returns
    -------
    Tuple[pd.DataFrame, Dict]
        A tuple containing:
            - The updated DataFrame with embeddings applied. An empty input frame is
              returned unchanged.
            - A dictionary with trace information, stored under the ``"trace_info"`` key.
    """
    api_key = task_config.get("api_key") or transform_config.api_key
    endpoint_url = task_config.get("endpoint_url") or transform_config.embedding_nim_endpoint
    model_name = task_config.get("model_name") or transform_config.embedding_model
    custom_content_field = task_config.get("custom_content_field") or transform_config.custom_content_field
    dimensions = task_config.get("dimensions") or transform_config.dimensions

    if execution_trace_log is None:
        execution_trace_log = {}
        logger.debug("No trace_info provided. Initialized empty trace_info dictionary.")

    if df_transform_ledger.empty:
        return df_transform_ledger, {"trace_info": execution_trace_log}

    embedding_dataframes = []
    content_masks = []

    def _unsupported_content_getter(_metadata, modality=None):
        # Placeholder getter for content types without embedding support. It must accept
        # the ``modality`` keyword because every getter below is invoked through
        # ``partial(content_getter, modality=...)``; a bare ``lambda x: None`` raises
        # TypeError as soon as a row of that type is present.
        return None

    pandas_content_extractor = {
        ContentTypeEnum.TEXT: _get_pandas_text_content,
        ContentTypeEnum.STRUCTURED: _get_pandas_table_content,
        ContentTypeEnum.IMAGE: _get_pandas_image_content,
        ContentTypeEnum.AUDIO: _get_pandas_audio_content,
        ContentTypeEnum.VIDEO: _unsupported_content_getter,  # Not supported yet.
    }
    task_type_to_modality = {
        ContentTypeEnum.TEXT: task_config.get("text_elements_modality") or transform_config.text_elements_modality,
        ContentTypeEnum.STRUCTURED: (
            task_config.get("structured_elements_modality") or transform_config.structured_elements_modality
        ),
        ContentTypeEnum.IMAGE: task_config.get("image_elements_modality") or transform_config.image_elements_modality,
        ContentTypeEnum.AUDIO: task_config.get("audio_elements_modality") or transform_config.audio_elements_modality,
        ContentTypeEnum.VIDEO: None,  # Not supported yet; no modality applies.
    }

    def _content_type_getter(row):
        return row["content_metadata"]["type"]

    # The per-row content type does not depend on the loop variable, so compute it once.
    row_content_types = df_transform_ledger["metadata"].apply(_content_type_getter)

    for content_type, content_getter in pandas_content_extractor.items():
        # Guard for entries explicitly mapped to a falsy getter (e.g. None).
        if not content_getter:
            logger.warning(f"Skipping text_embedding generation for unsupported content type: {content_type}")
            continue

        # Get rows matching the content type
        content_mask = row_content_types == content_type.value
        if not content_mask.any():
            continue

        # Always include all content_mask rows and prepare them
        df_content = df_transform_ledger.loc[content_mask].copy().reset_index(drop=True)

        # Extract content and normalize empty or non-str to None
        extracted_content = (
            df_content["metadata"]
            .apply(partial(content_getter, modality=task_type_to_modality[content_type]))
            .apply(lambda x: x.strip() if isinstance(x, str) and x.strip() else None)
        )
        df_content["_content"] = extracted_content

        # Prepare batches for only valid (non-None) content
        valid_content_mask = df_content["_content"].notna()
        if valid_content_mask.any():
            filtered_content_list = df_content.loc[valid_content_mask, "_content"].tolist()
            filtered_content_batches = _generate_batches(filtered_content_list, batch_size=transform_config.batch_size)

            if model_name in MULTI_MODAL_MODELS:
                # One modality entry per content item, batched identically to the content.
                modality_list = [task_type_to_modality[content_type]] * len(filtered_content_list)
                modality_batches = _generate_batches(modality_list, batch_size=transform_config.batch_size)
            else:
                modality_batches = None

            content_embeddings = _async_runner(
                filtered_content_batches,
                api_key,
                endpoint_url,
                model_name,
                transform_config.encoding_format,
                transform_config.input_type,
                transform_config.truncate,
                False,
                modalities=modality_batches,
                dimensions=dimensions,
            )
            # Build a simple row index -> embedding map
            valid_row_index = df_content.loc[valid_content_mask].index
            embeddings_dict = dict(zip(valid_row_index, content_embeddings.get("embeddings", [])))
            info_msgs_dict = dict(zip(valid_row_index, content_embeddings.get("info_msgs", [])))
        else:
            embeddings_dict = {}
            info_msgs_dict = {}

        # Apply embeddings or None to all rows
        df_content = df_content.apply(_add_embeddings, embeddings=embeddings_dict, info_msgs=info_msgs_dict, axis=1)

        embedding_dataframes.append(df_content)
        content_masks.append(content_mask)

    combined_df = _concatenate_extractions_pandas(df_transform_ledger, embedding_dataframes, content_masks)

    # Embed custom content
    if custom_content_field is not None:
        result_target_field = task_config.get("result_target_field") or custom_content_field + "_embedding"

        extracted_custom_content = (
            combined_df["metadata"]
            .apply(partial(_get_pandas_custom_content, custom_content_field=custom_content_field))
            .apply(lambda x: x.strip() if isinstance(x, str) and x.strip() else None)
        )

        valid_custom_content_mask = extracted_custom_content.notna()
        if valid_custom_content_mask.any():
            custom_content_list = extracted_custom_content[valid_custom_content_mask].to_list()
            custom_content_batches = _generate_batches(custom_content_list, batch_size=transform_config.batch_size)

            custom_content_embeddings = _async_runner(
                custom_content_batches,
                api_key,
                endpoint_url,
                model_name,
                transform_config.encoding_format,
                transform_config.input_type,
                transform_config.truncate,
                False,
                dimensions=dimensions,
            )
            # Map each row index of combined_df that had custom content to its embedding.
            custom_embeddings_dict = dict(
                zip(
                    extracted_custom_content.loc[valid_custom_content_mask].index,
                    custom_content_embeddings.get("embeddings", []),
                )
            )
        else:
            custom_embeddings_dict = {}

        combined_df = combined_df.apply(
            _add_custom_embeddings, embeddings=custom_embeddings_dict, result_target_field=result_target_field, axis=1
        )

    return combined_df, {"trace_info": execution_trace_log}
|