nv-ingest-api 2025.4.17.dev20250417__py3-none-any.whl → 2025.4.19.dev20250419__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of nv-ingest-api has been flagged as potentially problematic.
- nv_ingest_api/__init__.py +0 -3
- nv_ingest_api/{internal/primitives → primitives}/control_message_task.py +0 -4
- nv_ingest_api/{internal/primitives → primitives}/ingest_control_message.py +2 -5
- {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/METADATA +1 -1
- nv_ingest_api-2025.4.19.dev20250419.dist-info/RECORD +9 -0
- {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/WHEEL +1 -1
- nv_ingest_api/interface/__init__.py +0 -215
- nv_ingest_api/interface/extract.py +0 -972
- nv_ingest_api/interface/mutate.py +0 -154
- nv_ingest_api/interface/store.py +0 -218
- nv_ingest_api/interface/transform.py +0 -382
- nv_ingest_api/interface/utility.py +0 -200
- nv_ingest_api/internal/enums/__init__.py +0 -3
- nv_ingest_api/internal/enums/common.py +0 -494
- nv_ingest_api/internal/extract/__init__.py +0 -3
- nv_ingest_api/internal/extract/audio/__init__.py +0 -3
- nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -149
- nv_ingest_api/internal/extract/docx/__init__.py +0 -5
- nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -205
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -3
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -122
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -895
- nv_ingest_api/internal/extract/image/__init__.py +0 -3
- nv_ingest_api/internal/extract/image/chart_extractor.py +0 -353
- nv_ingest_api/internal/extract/image/image_extractor.py +0 -204
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -3
- nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -403
- nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -253
- nv_ingest_api/internal/extract/image/table_extractor.py +0 -344
- nv_ingest_api/internal/extract/pdf/__init__.py +0 -3
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -19
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -484
- nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -243
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -597
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -146
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -603
- nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -96
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -426
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -74
- nv_ingest_api/internal/extract/pptx/__init__.py +0 -5
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -799
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -187
- nv_ingest_api/internal/mutate/__init__.py +0 -3
- nv_ingest_api/internal/mutate/deduplicate.py +0 -110
- nv_ingest_api/internal/mutate/filter.py +0 -133
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/internal/primitives/nim/__init__.py +0 -8
- nv_ingest_api/internal/primitives/nim/default_values.py +0 -15
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -3
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -274
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -56
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -270
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -275
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -238
- nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -462
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -367
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -132
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -152
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -1400
- nv_ingest_api/internal/primitives/nim/nim_client.py +0 -344
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -81
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +0 -69
- nv_ingest_api/internal/primitives/tracing/logging.py +0 -96
- nv_ingest_api/internal/primitives/tracing/tagging.py +0 -197
- nv_ingest_api/internal/schemas/__init__.py +0 -3
- nv_ingest_api/internal/schemas/extract/__init__.py +0 -3
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -130
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -135
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -124
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -124
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -128
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -218
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -124
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -129
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -3
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -23
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -34
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -19
- nv_ingest_api/internal/schemas/meta/__init__.py +0 -3
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -11
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -237
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -221
- nv_ingest_api/internal/schemas/mutate/__init__.py +0 -3
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -16
- nv_ingest_api/internal/schemas/store/__init__.py +0 -3
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -28
- nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -30
- nv_ingest_api/internal/schemas/transform/__init__.py +0 -3
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -15
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -17
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -25
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -22
- nv_ingest_api/internal/store/__init__.py +0 -3
- nv_ingest_api/internal/store/embed_text_upload.py +0 -236
- nv_ingest_api/internal/store/image_upload.py +0 -232
- nv_ingest_api/internal/transform/__init__.py +0 -3
- nv_ingest_api/internal/transform/caption_image.py +0 -205
- nv_ingest_api/internal/transform/embed_text.py +0 -496
- nv_ingest_api/internal/transform/split_text.py +0 -157
- nv_ingest_api/util/__init__.py +0 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +0 -47
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +0 -78
- nv_ingest_api/util/converters/containers.py +0 -65
- nv_ingest_api/util/converters/datetools.py +0 -90
- nv_ingest_api/util/converters/dftools.py +0 -127
- nv_ingest_api/util/converters/formats.py +0 -64
- nv_ingest_api/util/converters/type_mappings.py +0 -27
- nv_ingest_api/util/detectors/__init__.py +0 -5
- nv_ingest_api/util/detectors/language.py +0 -38
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +0 -72
- nv_ingest_api/util/exception_handlers/decorators.py +0 -223
- nv_ingest_api/util/exception_handlers/detectors.py +0 -74
- nv_ingest_api/util/exception_handlers/pdf.py +0 -116
- nv_ingest_api/util/exception_handlers/schemas.py +0 -68
- nv_ingest_api/util/image_processing/__init__.py +0 -5
- nv_ingest_api/util/image_processing/clustering.py +0 -260
- nv_ingest_api/util/image_processing/processing.py +0 -179
- nv_ingest_api/util/image_processing/table_and_chart.py +0 -449
- nv_ingest_api/util/image_processing/transforms.py +0 -407
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +0 -31
- nv_ingest_api/util/message_brokers/__init__.py +0 -3
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -9
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -465
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -71
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -435
- nv_ingest_api/util/metadata/__init__.py +0 -5
- nv_ingest_api/util/metadata/aggregators.py +0 -469
- nv_ingest_api/util/multi_processing/__init__.py +0 -8
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -194
- nv_ingest_api/util/nim/__init__.py +0 -56
- nv_ingest_api/util/pdf/__init__.py +0 -3
- nv_ingest_api/util/pdf/pdfium.py +0 -427
- nv_ingest_api/util/schema/__init__.py +0 -0
- nv_ingest_api/util/schema/schema_validator.py +0 -10
- nv_ingest_api/util/service_clients/__init__.py +0 -3
- nv_ingest_api/util/service_clients/client_base.py +0 -72
- nv_ingest_api/util/service_clients/kafka/__init__.py +0 -3
- nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +0 -334
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +0 -398
- nv_ingest_api/util/string_processing/__init__.py +0 -51
- nv_ingest_api-2025.4.17.dev20250417.dist-info/RECORD +0 -152
- /nv_ingest_api/{internal → primitives}/__init__.py +0 -0
- {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/top_level.txt +0 -0
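Taken together, the file list amounts to a repackaging rather than an incremental change: `control_message_task.py` and `ingest_control_message.py` move from `nv_ingest_api/internal/primitives/` to `nv_ingest_api/primitives/`, and the entire `interface`, `internal`, and `util` trees are dropped from the wheel (the dist-info RECORD shrinks from 152 entries to 9). Below is a minimal sketch of the caller-side import change implied by the renames; the class names are assumptions inferred from the module names, not confirmed by this diff:

```python
# Hypothetical consumer code; class names are assumptions based on the module names.

# 2025.4.17.dev20250417 layout: the primitives lived under the internal package.
# from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
# from nv_ingest_api.internal.primitives.control_message_task import ControlMessageTask

# 2025.4.19.dev20250419 layout: the same modules sit one level up.
from nv_ingest_api.primitives.ingest_control_message import IngestControlMessage
from nv_ingest_api.primitives.control_message_task import ControlMessageTask

message = IngestControlMessage()  # construction is unchanged; only the import path moves
```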
nv_ingest_api/internal/transform/caption_image.py
@@ -1,205 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
-# All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-import logging
-from typing import Any, Dict, List, Optional, Tuple, Union
-
-import pandas as pd
-from pydantic import BaseModel
-
-from nv_ingest_api.internal.primitives.nim.model_interface.vlm import VLMModelInterface
-from nv_ingest_api.internal.enums.common import ContentTypeEnum
-from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
-from nv_ingest_api.util.image_processing import scale_image_to_encoding_size
-from nv_ingest_api.util.nim import create_inference_client
-
-logger = logging.getLogger(__name__)
-
-
-def _prepare_dataframes_mod(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series]:
-    """
-    Prepares and returns three DataFrame-related objects from the input DataFrame.
-
-    The function performs the following:
-    1. Checks if the DataFrame is empty or if the "document_type" column is missing.
-       In such a case, returns the original DataFrame, an empty DataFrame, and an empty boolean Series.
-    2. Otherwise, it creates a boolean Series identifying rows where "document_type" equals IMAGE.
-    3. Extracts a DataFrame containing only those rows.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        The input DataFrame that should contain a "document_type" column.
-
-    Returns
-    -------
-    Tuple[pd.DataFrame, pd.DataFrame, pd.Series]
-        A tuple containing:
-        - The original DataFrame.
-        - A DataFrame filtered to rows where "document_type" is IMAGE.
-        - A boolean Series indicating which rows in the original DataFrame are IMAGE rows.
-    """
-    try:
-        if df.empty or "document_type" not in df.columns:
-            return df, pd.DataFrame(), pd.Series(dtype=bool)
-
-        bool_index: pd.Series = df["document_type"] == ContentTypeEnum.IMAGE
-        df_matched: pd.DataFrame = df.loc[bool_index]
-
-        return df, df_matched, bool_index
-
-    except Exception as e:
-        err_msg = f"_prepare_dataframes_mod: Error preparing dataframes. Original error: {e}"
-        logger.error(err_msg, exc_info=True)
-        raise type(e)(err_msg) from e
-
-
-def _generate_captions(
-    base64_images: List[str], prompt: str, api_key: str, endpoint_url: str, model_name: str
-) -> List[str]:
-    """
-    Generates captions for a list of base64-encoded PNG images using the VLM model API.
-
-    This function performs the following steps:
-    1. Scales each image to meet encoding size requirements using `scale_image_to_encoding_size`.
-    2. Constructs the input payload containing the scaled images and the provided prompt.
-    3. Creates an inference client using the VLMModelInterface.
-    4. Calls the client's infer method to obtain a list of captions corresponding to the images.
-
-    Parameters
-    ----------
-    base64_images : List[str]
-        List of base64-encoded PNG image strings.
-    prompt : str
-        Text prompt to guide caption generation.
-    api_key : str
-        API key for authenticating with the VLM endpoint.
-    endpoint_url : str
-        URL of the VLM model HTTP endpoint.
-    model_name : str
-        The name of the model to use for inference.
-
-    Returns
-    -------
-    List[str]
-        A list of generated captions, each corresponding to an input image.
-
-    Raises
-    ------
-    Exception
-        Propagates any exception encountered during caption generation, with added context.
-    """
-    try:
-        # Scale each image to ensure it meets encoding size requirements.
-        scaled_images: List[str] = [scale_image_to_encoding_size(b64)[0] for b64 in base64_images]
-
-        # Build the input payload for the VLM model.
-        data: Dict[str, Any] = {
-            "base64_images": scaled_images,
-            "prompt": prompt,
-        }
-
-        # Create the inference client using the VLMModelInterface.
-        nim_client = create_inference_client(
-            model_interface=VLMModelInterface(),
-            endpoints=(None, endpoint_url),
-            auth_token=api_key,
-            infer_protocol="http",
-        )
-
-        logger.debug(f"Calling VLM endpoint: {endpoint_url} with model: {model_name}")
-        # Perform inference to generate captions.
-        captions: List[str] = nim_client.infer(data, model_name=model_name)
-        return captions
-
-    except Exception as e:
-        err_msg = f"_generate_captions: Error generating captions: {e}"
-        logger.error(err_msg, exc_info=True)
-        raise type(e)(err_msg) from e
-
-
-@unified_exception_handler
-def transform_image_create_vlm_caption_internal(
-    df_transform_ledger: pd.DataFrame,
-    task_config: Union[BaseModel, Dict[str, Any]],
-    transform_config: Any,
-    execution_trace_log: Optional[Dict[str, Any]] = None,
-) -> pd.DataFrame:
-    """
-    Extracts and adds captions for image content in a DataFrame using the VLM model API.
-
-    This function updates the 'metadata' column for rows where the content type is "image".
-    It uses configuration values from task_config (or falls back to transform_config defaults)
-    to determine the API key, prompt, endpoint URL, and model name for caption generation.
-    The generated captions are added under the 'image_metadata.caption' key in the metadata.
-
-    Parameters
-    ----------
-    df_transform_ledger : pd.DataFrame
-        The input DataFrame containing image data. Each row must have a 'metadata' column
-        with at least the 'content' and 'content_metadata' keys.
-    task_config : Union[BaseModel, Dict[str, Any]]
-        Configuration parameters for caption extraction. If provided as a Pydantic model,
-        it will be converted to a dictionary. Expected keys include "api_key", "prompt",
-        "endpoint_url", and "model_name".
-    transform_config : Any
-        A configuration object providing default values for caption extraction. It should have
-        attributes: api_key, prompt, endpoint_url, and model_name.
-    execution_trace_log : Optional[Dict[str, Any]], default=None
-        Optional trace information for debugging or logging purposes.
-
-    Returns
-    -------
-    pd.DataFrame
-        The updated DataFrame with generated captions added to the 'image_metadata.caption' field
-        within the 'metadata' column for each image row.
-
-    Raises
-    ------
-    Exception
-        Propagates any exception encountered during the caption extraction process, with added context.
-    """
-
-    _ = execution_trace_log  # Unused variable; placeholder to prevent linter warnings.
-
-    logger.debug("Attempting to caption image content")
-
-    # Convert task_config to dictionary if it is a Pydantic model.
-    if isinstance(task_config, BaseModel):
-        task_config = task_config.model_dump()
-
-    # Retrieve configuration values with fallback to transform_config defaults.
-    api_key: str = task_config.get("api_key") or transform_config.api_key
-    prompt: str = task_config.get("prompt") or transform_config.prompt
-    endpoint_url: str = task_config.get("endpoint_url") or transform_config.endpoint_url
-    model_name: str = task_config.get("model_name") or transform_config.model_name
-
-    # Create a mask for rows where the content type is "image".
-    df_mask: pd.Series = df_transform_ledger["metadata"].apply(
-        lambda meta: meta.get("content_metadata", {}).get("type") == "image"
-    )
-
-    # If no image rows exist, return the original DataFrame.
-    if not df_mask.any():
-        return df_transform_ledger
-
-    # Collect base64-encoded images from the rows where the content type is "image".
-    base64_images: List[str] = df_transform_ledger.loc[df_mask, "metadata"].apply(lambda meta: meta["content"]).tolist()
-
-    # Generate captions for the collected images.
-    captions: List[str] = _generate_captions(base64_images, prompt, api_key, endpoint_url, model_name)
-
-    # Update the DataFrame: assign each generated caption to the corresponding row.
-    for idx, caption in zip(df_transform_ledger.loc[df_mask].index, captions):
-        meta: Dict[str, Any] = df_transform_ledger.at[idx, "metadata"]
-        image_meta: Dict[str, Any] = meta.get("image_metadata", {})
-        image_meta["caption"] = caption
-        meta["image_metadata"] = image_meta
-        df_transform_ledger.at[idx, "metadata"] = meta
-
-    logger.debug("Image content captioning complete")
-    result, execution_trace_log = df_transform_ledger, {}
-    _ = execution_trace_log  # Unused variable; placeholder to prevent linter warnings.
-
-    return result
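For context on what was removed: the module's entry point, `transform_image_create_vlm_caption_internal`, masks rows whose `content_metadata.type` is "image", sends their base64 payloads through `_generate_captions`, and writes the results back under `image_metadata.caption`. A minimal usage sketch against the 2025.4.17 wheel follows; the placeholder image, endpoint URL, model name, and the `SimpleNamespace` stand-in for the transform-config object are illustrative assumptions, not values taken from this diff:

```python
from types import SimpleNamespace

import pandas as pd

# Import path resolves only in 2025.4.17.dev20250417; the module is gone in 2025.4.19.
from nv_ingest_api.internal.transform.caption_image import (
    transform_image_create_vlm_caption_internal,
)

# Only the 'metadata' column is consulted by this function; each image row needs
# 'content' (a base64-encoded PNG) and 'content_metadata.type' == "image".
ledger = pd.DataFrame(
    {
        "metadata": [
            {
                "content": "<base64-encoded PNG>",  # placeholder, not a real image
                "content_metadata": {"type": "image"},
                "image_metadata": {},
            }
        ]
    }
)

# Stand-in for the real transform config; the four attributes match the docstring above.
fallback_config = SimpleNamespace(
    api_key="nvapi-...",  # placeholder credential
    prompt="Caption the content of this image:",
    endpoint_url="https://example.invalid/v1/chat/completions",  # illustrative endpoint
    model_name="example/vlm-model",  # illustrative model name
)

captioned = transform_image_create_vlm_caption_internal(
    df_transform_ledger=ledger,
    task_config={},  # empty, so every value falls back to fallback_config
    transform_config=fallback_config,
)
print(captioned.loc[0, "metadata"]["image_metadata"]["caption"])
```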
nv_ingest_api/internal/transform/embed_text.py
@@ -1,496 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
-# All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-import logging
-from concurrent.futures import ThreadPoolExecutor
-from typing import Any, Dict, Tuple, Optional, Iterable, List
-
-import pandas as pd
-from openai import OpenAI
-
-from nv_ingest_api.internal.enums.common import ContentTypeEnum, StatusEnum, TaskTypeEnum
-from nv_ingest_api.internal.schemas.meta.metadata_schema import (
-    InfoMessageMetadataSchema,
-)
-from nv_ingest_api.internal.schemas.transform.transform_text_embedding_schema import TextEmbeddingSchema
-from nv_ingest_api.util.schema.schema_validator import validate_schema
-
-logger = logging.getLogger(__name__)
-
-
-# ------------------------------------------------------------------------------
-# Asynchronous Embedding Requests
-# ------------------------------------------------------------------------------
-
-
-def _make_async_request(
-    prompts: List[str],
-    api_key: str,
-    embedding_nim_endpoint: str,
-    embedding_model: str,
-    encoding_format: str,
-    input_type: str,
-    truncate: str,
-    filter_errors: bool,
-) -> list:
-    """
-    Interacts directly with the NIM embedding service to calculate embeddings for a batch of prompts.
-
-    Parameters
-    ----------
-    prompts : List[str]
-        A list of prompt strings for which embeddings are to be calculated.
-    api_key : str
-        API key for authentication with the embedding service.
-    embedding_nim_endpoint : str
-        Base URL for the NIM embedding service.
-    embedding_model : str
-        The model to use for generating embeddings.
-    encoding_format : str
-        The desired encoding format.
-    input_type : str
-        The type of input data.
-    truncate : str
-        Truncation setting for the input data.
-    filter_errors : bool
-        Flag indicating whether to filter errors in the response.
-
-    Returns
-    -------
-    list
-        A dictionary with keys "embedding" (the embedding results) and "info_msg" (any error info).
-
-    Raises
-    ------
-    RuntimeError
-        If an error occurs during the embedding request, with an info message attached.
-    """
-    response = {}
-
-    try:
-        client = OpenAI(
-            api_key=api_key,
-            base_url=embedding_nim_endpoint,
-        )
-
-        resp = client.embeddings.create(
-            input=prompts,
-            model=embedding_model,
-            encoding_format=encoding_format,
-            extra_body={"input_type": input_type, "truncate": truncate},
-        )
-
-        response["embedding"] = resp.data
-        response["info_msg"] = None
-
-    except Exception as err:
-        info_msg = {
-            "task": TaskTypeEnum.EMBED.value,
-            "status": StatusEnum.ERROR.value,
-            "message": f"Embedding error: {err}",
-            "filter": filter_errors,
-        }
-        validated_info_msg = validate_schema(info_msg, InfoMessageMetadataSchema).model_dump()
-
-        response["embedding"] = [None] * len(prompts)
-        response["info_msg"] = validated_info_msg
-
-        raise RuntimeError(f"Embedding error occurred. Info message: {validated_info_msg}") from err
-
-    return response
-
-
-def _async_request_handler(
-    prompts: List[str],
-    api_key: str,
-    embedding_nim_endpoint: str,
-    embedding_model: str,
-    encoding_format: str,
-    input_type: str,
-    truncate: str,
-    filter_errors: bool,
-) -> List[dict]:
-    """
-    Gathers calculated embedding results from the NIM embedding service concurrently.
-
-    Parameters
-    ----------
-    prompts : List[str]
-        A list of prompt batches.
-    api_key : str
-        API key for authentication.
-    embedding_nim_endpoint : str
-        Base URL for the NIM embedding service.
-    embedding_model : str
-        The model to use for generating embeddings.
-    encoding_format : str
-        The desired encoding format.
-    input_type : str
-        The type of input data.
-    truncate : str
-        Truncation setting for the input data.
-    filter_errors : bool
-        Flag indicating whether to filter errors in the response.
-
-    Returns
-    -------
-    List[dict]
-        A list of response dictionaries from the embedding service.
-    """
-    with ThreadPoolExecutor() as executor:
-        futures = [
-            executor.submit(
-                _make_async_request,
-                prompts=prompt_batch,
-                api_key=api_key,
-                embedding_nim_endpoint=embedding_nim_endpoint,
-                embedding_model=embedding_model,
-                encoding_format=encoding_format,
-                input_type=input_type,
-                truncate=truncate,
-                filter_errors=filter_errors,
-            )
-            for prompt_batch in prompts
-        ]
-        results = [future.result() for future in futures]
-
-    return results
-
-
-def _async_runner(
-    prompts: List[str],
-    api_key: str,
-    embedding_nim_endpoint: str,
-    embedding_model: str,
-    encoding_format: str,
-    input_type: str,
-    truncate: str,
-    filter_errors: bool,
-) -> dict:
-    """
-    Concurrently launches all NIM embedding requests and flattens the results.
-
-    Parameters
-    ----------
-    prompts : List[str]
-        A list of prompt batches.
-    api_key : str
-        API key for authentication.
-    embedding_nim_endpoint : str
-        Base URL for the NIM embedding service.
-    embedding_model : str
-        The model to use for generating embeddings.
-    encoding_format : str
-        The desired encoding format.
-    input_type : str
-        The type of input data.
-    truncate : str
-        Truncation setting for the input data.
-    filter_errors : bool
-        Flag indicating whether to filter errors in the response.
-
-    Returns
-    -------
-    dict
-        A dictionary with keys "embeddings" (flattened embedding results) and "info_msgs" (error messages).
-    """
-    results = _async_request_handler(
-        prompts,
-        api_key,
-        embedding_nim_endpoint,
-        embedding_model,
-        encoding_format,
-        input_type,
-        truncate,
-        filter_errors,
-    )
-
-    flat_results = {"embeddings": [], "info_msgs": []}
-    for batch_dict in results:
-        info_msg = batch_dict["info_msg"]
-        for embedding in batch_dict["embedding"]:
-            if not isinstance(embedding, list):
-                if embedding is not None:
-                    flat_results["embeddings"].append(embedding.embedding)
-                else:
-                    flat_results["embeddings"].append(embedding)
-            else:
-                flat_results["embeddings"].append(embedding)
-            flat_results["info_msgs"].append(info_msg)
-
-    return flat_results
-
-
-# ------------------------------------------------------------------------------
-# Pandas UDFs for Content Extraction
-# ------------------------------------------------------------------------------
-
-
-def _add_embeddings(row, embeddings, info_msgs):
-    """
-    Updates a DataFrame row with embedding data and associated error info.
-
-    Parameters
-    ----------
-    row : pandas.Series
-        A row of the DataFrame.
-    embeddings : list
-        List of embeddings corresponding to DataFrame rows.
-    info_msgs : list
-        List of info message dictionaries corresponding to DataFrame rows.
-
-    Returns
-    -------
-    pandas.Series
-        The updated row with embedding and info message metadata added.
-    """
-    row["metadata"]["embedding"] = embeddings[row.name]
-    if info_msgs[row.name] is not None:
-        row["metadata"]["info_message_metadata"] = info_msgs[row.name]
-        row["document_type"] = ContentTypeEnum.INFO_MSG
-        row["_contains_embeddings"] = False
-    else:
-        row["_contains_embeddings"] = True
-
-    return row
-
-
-def _get_pandas_text_content(row):
-    """
-    Extracts text content from a DataFrame row.
-
-    Parameters
-    ----------
-    row : pandas.Series
-        A row containing the 'content' key.
-
-    Returns
-    -------
-    str
-        The text content from the row.
-    """
-    return row["content"]
-
-
-def _get_pandas_table_content(row):
-    """
-    Extracts table/chart content from a DataFrame row.
-
-    Parameters
-    ----------
-    row : pandas.Series
-        A row containing 'table_metadata' with 'table_content'.
-
-    Returns
-    -------
-    str
-        The table/chart content from the row.
-    """
-    return row["table_metadata"]["table_content"]
-
-
-def _get_pandas_image_content(row):
-    """
-    Extracts image caption content from a DataFrame row.
-
-    Parameters
-    ----------
-    row : pandas.Series
-        A row containing 'image_metadata' with 'caption'.
-
-    Returns
-    -------
-    str
-        The image caption from the row.
-    """
-    return row["image_metadata"]["caption"]
-
-
-# ------------------------------------------------------------------------------
-# Batch Processing Utilities
-# ------------------------------------------------------------------------------
-
-
-def _batch_generator(iterable: Iterable, batch_size: int = 10):
-    """
-    Yields batches of a specified size from an iterable.
-
-    Parameters
-    ----------
-    iterable : Iterable
-        The iterable to batch.
-    batch_size : int, optional
-        The size of each batch (default is 10).
-
-    Yields
-    ------
-    list
-        A batch of items from the iterable.
-    """
-    iter_len = len(iterable)
-    for idx in range(0, iter_len, batch_size):
-        yield iterable[idx : min(idx + batch_size, iter_len)]
-
-
-def _generate_batches(prompts: List[str], batch_size: int = 100) -> List[str]:
-    """
-    Splits a list of prompts into batches.

-    Parameters
-    ----------
-    prompts : List[str]
-        The list of prompt strings.
-    batch_size : int, optional
-        The desired batch size (default is 100).
-
-    Returns
-    -------
-    List[List[str]]
-        A list of batches, each containing a subset of the prompts.
-    """
-    return [batch for batch in _batch_generator(prompts, batch_size)]
-
-
-def _get_pandas_audio_content(row):
-    """
-    A pandas UDF used to select extracted audio transcription to be used to create embeddings.
-    """
-    return row["audio_metadata"]["audio_transcript"]
-
-
-# ------------------------------------------------------------------------------
-# DataFrame Concatenation Utility
-# ------------------------------------------------------------------------------
-
-
-def _concatenate_extractions_pandas(
-    base_df: pd.DataFrame, dataframes: List[pd.DataFrame], masks: List[pd.Series]
-) -> pd.DataFrame:
-    """
-    Concatenates processed DataFrame rows (with embeddings) with unprocessed rows from the base DataFrame.
-
-    Parameters
-    ----------
-    base_df : pd.DataFrame
-        The original DataFrame.
-    dataframes : List[pd.DataFrame]
-        List of DataFrames that have been enriched with embeddings.
-    masks : List[pd.Series]
-        List of boolean masks indicating the rows that were processed.
-
-    Returns
-    -------
-    pd.DataFrame
-        The concatenated DataFrame with embeddings applied where available.
-    """
-    unified_mask = pd.Series(False, index=base_df.index)
-    for mask in masks:
-        unified_mask = unified_mask | mask
-
-    df_no_text = base_df.loc[~unified_mask].copy()
-    df_no_text["_contains_embeddings"] = False
-
-    dataframes.append(df_no_text)
-    combined_df = pd.concat(dataframes, axis=0, ignore_index=True).reset_index(drop=True)
-    return combined_df
-
-
-# ------------------------------------------------------------------------------
-# Embedding Extraction Pipeline
-# ------------------------------------------------------------------------------
-
-
-def transform_create_text_embeddings_internal(
-    df_transform_ledger: pd.DataFrame,
-    task_config: Dict[str, Any],
-    transform_config: TextEmbeddingSchema = TextEmbeddingSchema(),
-    execution_trace_log: Optional[Dict] = None,
-) -> Tuple[pd.DataFrame, Dict]:
-    """
-    Generates text embeddings for supported content types (TEXT, STRUCTURED, IMAGE)
-    from a pandas DataFrame using asynchronous requests.
-
-    Parameters
-    ----------
-    df_transform_ledger : pd.DataFrame
-        The DataFrame containing content for embedding extraction.
-    task_config : Dict[str, Any]
-        Dictionary containing task properties (e.g., filter error flag).
-    transform_config : Any
-        Validated configuration for text embedding extraction (EmbedExtractionsSchema).
-    execution_trace_log : Optional[Dict], optional
-        Optional trace information for debugging or logging (default is None).
-
-    Returns
-    -------
-    Tuple[pd.DataFrame, Dict]
-        A tuple containing:
-        - The updated DataFrame with embeddings applied.
-        - A dictionary with trace information.
-    """
-    _ = task_config  # Currently unused.
-
-    if execution_trace_log is None:
-        execution_trace_log = {}
-        logger.debug("No trace_info provided. Initialized empty trace_info dictionary.")
-
-    # TODO(Devin)
-    if df_transform_ledger.empty:
-        return df_transform_ledger, {"trace_info": execution_trace_log}
-
-    embedding_dataframes = []
-    content_masks = []  # List of pandas boolean Series
-
-    # Define pandas content extractors for supported content types.
-    pandas_content_extractor = {
-        ContentTypeEnum.TEXT: _get_pandas_text_content,
-        ContentTypeEnum.STRUCTURED: _get_pandas_table_content,
-        ContentTypeEnum.IMAGE: _get_pandas_image_content,
-        ContentTypeEnum.AUDIO: _get_pandas_audio_content,
-        ContentTypeEnum.VIDEO: lambda x: None,  # Not supported yet.
-    }
-
-    logger.debug("Generating text embeddings for supported content types: TEXT, STRUCTURED, IMAGE.")
-
-    # Process each supported content type.
-    for content_type, content_getter in pandas_content_extractor.items():
-        if not content_getter:
-            logger.debug(f"Skipping unsupported content type: {content_type}")
-            continue
-
-        content_mask = df_transform_ledger["document_type"] == content_type.value
-        if not content_mask.any():
-            continue
-
-        # Extract content from metadata and filter out rows with empty content.
-        extracted_content = df_transform_ledger.loc[content_mask, "metadata"].apply(content_getter)
-        non_empty_mask = extracted_content.notna() & (extracted_content.str.strip() != "")
-        final_mask = content_mask & non_empty_mask
-        if not final_mask.any():
-            continue
-
-        df_content = df_transform_ledger.loc[final_mask].copy().reset_index(drop=True)
-        filtered_content = df_content["metadata"].apply(content_getter)
-        filtered_content_batches = _generate_batches(filtered_content.tolist(), batch_size=transform_config.batch_size)
-        content_embeddings = _async_runner(
-            filtered_content_batches,
-            transform_config.api_key,
-            transform_config.embedding_nim_endpoint,
-            transform_config.embedding_model,
-            transform_config.encoding_format,
-            transform_config.input_type,
-            transform_config.truncate,
-            False,
-        )
-        # Apply the embeddings (and any error info) to each row.
-        df_content[["metadata", "document_type", "_contains_embeddings"]] = df_content.apply(
-            _add_embeddings, **content_embeddings, axis=1
-        )[["metadata", "document_type", "_contains_embeddings"]]
-        df_content["_content"] = filtered_content
-
-        embedding_dataframes.append(df_content)
-        content_masks.append(final_mask)
-
-    combined_df = _concatenate_extractions_pandas(df_transform_ledger, embedding_dataframes, content_masks)
-    return combined_df, {"trace_info": execution_trace_log}
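The removed embedding module follows the same ledger convention as the captioning module: rows are selected by `document_type`, per-type getters pull the embeddable text out of `metadata`, batches are fanned out to the NIM endpoint through a thread pool, and `_concatenate_extractions_pandas` stitches processed and untouched rows back together. A minimal end-to-end sketch against the 2025.4.17 wheel; the endpoint, model, and API key are illustrative, and the `TextEmbeddingSchema` constructor fields are inferred from how `transform_config` is consumed above:

```python
import pandas as pd

# Import paths resolve only in 2025.4.17.dev20250417; these modules are gone in 2025.4.19.
from nv_ingest_api.internal.enums.common import ContentTypeEnum
from nv_ingest_api.internal.schemas.transform.transform_text_embedding_schema import (
    TextEmbeddingSchema,
)
from nv_ingest_api.internal.transform.embed_text import (
    transform_create_text_embeddings_internal,
)

# One TEXT row: 'document_type' must equal ContentTypeEnum.TEXT.value, and
# _get_pandas_text_content expects the text under metadata['content'].
ledger = pd.DataFrame(
    {
        "document_type": [ContentTypeEnum.TEXT.value],
        "metadata": [{"content": "NV-Ingest turns raw documents into retrievable chunks."}],
    }
)

# Field values are placeholders; the schema also carries its own defaults.
config = TextEmbeddingSchema(
    api_key="nvapi-...",
    embedding_nim_endpoint="http://localhost:8000/v1",
    embedding_model="example/embedding-model",
)

embedded_df, trace = transform_create_text_embeddings_internal(
    df_transform_ledger=ledger,
    task_config={},  # unused by the function body above
    transform_config=config,
)
row_meta = embedded_df.loc[0, "metadata"]
print(len(row_meta["embedding"]))  # length of the embedding vector returned by the service
```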