nv-ingest-api 2025.4.18.dev20250418__py3-none-any.whl → 2025.4.20.dev20250420__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nv-ingest-api might be problematic.
Files changed (153)
  1. nv_ingest_api/__init__.py +0 -3
  2. nv_ingest_api/{internal/primitives → primitives}/control_message_task.py +0 -4
  3. nv_ingest_api/{internal/primitives → primitives}/ingest_control_message.py +2 -5
  4. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.20.dev20250420.dist-info}/METADATA +1 -1
  5. nv_ingest_api-2025.4.20.dev20250420.dist-info/RECORD +9 -0
  6. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.20.dev20250420.dist-info}/WHEEL +1 -1
  7. nv_ingest_api/interface/__init__.py +0 -215
  8. nv_ingest_api/interface/extract.py +0 -972
  9. nv_ingest_api/interface/mutate.py +0 -154
  10. nv_ingest_api/interface/store.py +0 -218
  11. nv_ingest_api/interface/transform.py +0 -382
  12. nv_ingest_api/interface/utility.py +0 -200
  13. nv_ingest_api/internal/enums/__init__.py +0 -3
  14. nv_ingest_api/internal/enums/common.py +0 -494
  15. nv_ingest_api/internal/extract/__init__.py +0 -3
  16. nv_ingest_api/internal/extract/audio/__init__.py +0 -3
  17. nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -149
  18. nv_ingest_api/internal/extract/docx/__init__.py +0 -5
  19. nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -205
  20. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  21. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -3
  22. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -122
  23. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -895
  24. nv_ingest_api/internal/extract/image/__init__.py +0 -3
  25. nv_ingest_api/internal/extract/image/chart_extractor.py +0 -353
  26. nv_ingest_api/internal/extract/image/image_extractor.py +0 -204
  27. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -3
  28. nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -403
  29. nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -253
  30. nv_ingest_api/internal/extract/image/table_extractor.py +0 -344
  31. nv_ingest_api/internal/extract/pdf/__init__.py +0 -3
  32. nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -19
  33. nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -484
  34. nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -243
  35. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -597
  36. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -146
  37. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -603
  38. nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -96
  39. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -426
  40. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -74
  41. nv_ingest_api/internal/extract/pptx/__init__.py +0 -5
  42. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  43. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -799
  44. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -187
  45. nv_ingest_api/internal/mutate/__init__.py +0 -3
  46. nv_ingest_api/internal/mutate/deduplicate.py +0 -110
  47. nv_ingest_api/internal/mutate/filter.py +0 -133
  48. nv_ingest_api/internal/primitives/__init__.py +0 -0
  49. nv_ingest_api/internal/primitives/nim/__init__.py +0 -8
  50. nv_ingest_api/internal/primitives/nim/default_values.py +0 -15
  51. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -3
  52. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -274
  53. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -56
  54. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -270
  55. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -275
  56. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -238
  57. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -462
  58. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -367
  59. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -132
  60. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -152
  61. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -1400
  62. nv_ingest_api/internal/primitives/nim/nim_client.py +0 -344
  63. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -81
  64. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  65. nv_ingest_api/internal/primitives/tracing/latency.py +0 -69
  66. nv_ingest_api/internal/primitives/tracing/logging.py +0 -96
  67. nv_ingest_api/internal/primitives/tracing/tagging.py +0 -197
  68. nv_ingest_api/internal/schemas/__init__.py +0 -3
  69. nv_ingest_api/internal/schemas/extract/__init__.py +0 -3
  70. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -130
  71. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -135
  72. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -124
  73. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -124
  74. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -128
  75. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -218
  76. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -124
  77. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -129
  78. nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -3
  79. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -23
  80. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -34
  81. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -19
  82. nv_ingest_api/internal/schemas/meta/__init__.py +0 -3
  83. nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -11
  84. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -237
  85. nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -221
  86. nv_ingest_api/internal/schemas/mutate/__init__.py +0 -3
  87. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -16
  88. nv_ingest_api/internal/schemas/store/__init__.py +0 -3
  89. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -28
  90. nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -30
  91. nv_ingest_api/internal/schemas/transform/__init__.py +0 -3
  92. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -15
  93. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -17
  94. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -25
  95. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -22
  96. nv_ingest_api/internal/store/__init__.py +0 -3
  97. nv_ingest_api/internal/store/embed_text_upload.py +0 -236
  98. nv_ingest_api/internal/store/image_upload.py +0 -232
  99. nv_ingest_api/internal/transform/__init__.py +0 -3
  100. nv_ingest_api/internal/transform/caption_image.py +0 -205
  101. nv_ingest_api/internal/transform/embed_text.py +0 -496
  102. nv_ingest_api/internal/transform/split_text.py +0 -157
  103. nv_ingest_api/util/__init__.py +0 -0
  104. nv_ingest_api/util/control_message/__init__.py +0 -0
  105. nv_ingest_api/util/control_message/validators.py +0 -47
  106. nv_ingest_api/util/converters/__init__.py +0 -0
  107. nv_ingest_api/util/converters/bytetools.py +0 -78
  108. nv_ingest_api/util/converters/containers.py +0 -65
  109. nv_ingest_api/util/converters/datetools.py +0 -90
  110. nv_ingest_api/util/converters/dftools.py +0 -127
  111. nv_ingest_api/util/converters/formats.py +0 -64
  112. nv_ingest_api/util/converters/type_mappings.py +0 -27
  113. nv_ingest_api/util/detectors/__init__.py +0 -5
  114. nv_ingest_api/util/detectors/language.py +0 -38
  115. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  116. nv_ingest_api/util/exception_handlers/converters.py +0 -72
  117. nv_ingest_api/util/exception_handlers/decorators.py +0 -223
  118. nv_ingest_api/util/exception_handlers/detectors.py +0 -74
  119. nv_ingest_api/util/exception_handlers/pdf.py +0 -116
  120. nv_ingest_api/util/exception_handlers/schemas.py +0 -68
  121. nv_ingest_api/util/image_processing/__init__.py +0 -5
  122. nv_ingest_api/util/image_processing/clustering.py +0 -260
  123. nv_ingest_api/util/image_processing/processing.py +0 -179
  124. nv_ingest_api/util/image_processing/table_and_chart.py +0 -449
  125. nv_ingest_api/util/image_processing/transforms.py +0 -407
  126. nv_ingest_api/util/logging/__init__.py +0 -0
  127. nv_ingest_api/util/logging/configuration.py +0 -31
  128. nv_ingest_api/util/message_brokers/__init__.py +0 -3
  129. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -9
  130. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -465
  131. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -71
  132. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -451
  133. nv_ingest_api/util/metadata/__init__.py +0 -5
  134. nv_ingest_api/util/metadata/aggregators.py +0 -469
  135. nv_ingest_api/util/multi_processing/__init__.py +0 -8
  136. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -194
  137. nv_ingest_api/util/nim/__init__.py +0 -56
  138. nv_ingest_api/util/pdf/__init__.py +0 -3
  139. nv_ingest_api/util/pdf/pdfium.py +0 -427
  140. nv_ingest_api/util/schema/__init__.py +0 -0
  141. nv_ingest_api/util/schema/schema_validator.py +0 -10
  142. nv_ingest_api/util/service_clients/__init__.py +0 -3
  143. nv_ingest_api/util/service_clients/client_base.py +0 -86
  144. nv_ingest_api/util/service_clients/kafka/__init__.py +0 -3
  145. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  146. nv_ingest_api/util/service_clients/redis/redis_client.py +0 -823
  147. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  148. nv_ingest_api/util/service_clients/rest/rest_client.py +0 -531
  149. nv_ingest_api/util/string_processing/__init__.py +0 -51
  150. nv_ingest_api-2025.4.18.dev20250418.dist-info/RECORD +0 -152
  151. /nv_ingest_api/{internal → primitives}/__init__.py +0 -0
  152. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.20.dev20250420.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.20.dev20250420.dist-info}/top_level.txt +0 -0
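
Taken together, the listing shows that this release strips the `interface` and `internal` trees from the wheel and promotes a handful of primitives from `nv_ingest_api/internal/primitives/` to `nv_ingest_api/primitives/` (entries 2, 3, and 151). For downstream code, the renames imply an import-path change along the following lines; this is a sketch inferred from the moved paths alone, and the class name `IngestControlMessage` is an assumption based on the module name:

    # Before, against 2025.4.18.dev20250418 (module under the internal tree):
    from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage  # assumed class name

    # After, against 2025.4.20.dev20250420 (module promoted to the top-level primitives package):
    from nv_ingest_api.primitives.ingest_control_message import IngestControlMessage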
nv_ingest_api/internal/transform/caption_image.py
@@ -1,205 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
- # All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- import logging
- from typing import Any, Dict, List, Optional, Tuple, Union
-
- import pandas as pd
- from pydantic import BaseModel
-
- from nv_ingest_api.internal.primitives.nim.model_interface.vlm import VLMModelInterface
- from nv_ingest_api.internal.enums.common import ContentTypeEnum
- from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
- from nv_ingest_api.util.image_processing import scale_image_to_encoding_size
- from nv_ingest_api.util.nim import create_inference_client
-
- logger = logging.getLogger(__name__)
-
-
- def _prepare_dataframes_mod(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series]:
-     """
-     Prepares and returns three DataFrame-related objects from the input DataFrame.
-
-     The function performs the following:
-     1. Checks if the DataFrame is empty or if the "document_type" column is missing.
-        In such a case, returns the original DataFrame, an empty DataFrame, and an empty boolean Series.
-     2. Otherwise, it creates a boolean Series identifying rows where "document_type" equals IMAGE.
-     3. Extracts a DataFrame containing only those rows.
-
-     Parameters
-     ----------
-     df : pd.DataFrame
-         The input DataFrame that should contain a "document_type" column.
-
-     Returns
-     -------
-     Tuple[pd.DataFrame, pd.DataFrame, pd.Series]
-         A tuple containing:
-         - The original DataFrame.
-         - A DataFrame filtered to rows where "document_type" is IMAGE.
-         - A boolean Series indicating which rows in the original DataFrame are IMAGE rows.
-     """
-     try:
-         if df.empty or "document_type" not in df.columns:
-             return df, pd.DataFrame(), pd.Series(dtype=bool)
-
-         bool_index: pd.Series = df["document_type"] == ContentTypeEnum.IMAGE
-         df_matched: pd.DataFrame = df.loc[bool_index]
-
-         return df, df_matched, bool_index
-
-     except Exception as e:
-         err_msg = f"_prepare_dataframes_mod: Error preparing dataframes. Original error: {e}"
-         logger.error(err_msg, exc_info=True)
-         raise type(e)(err_msg) from e
-
-
- def _generate_captions(
-     base64_images: List[str], prompt: str, api_key: str, endpoint_url: str, model_name: str
- ) -> List[str]:
-     """
-     Generates captions for a list of base64-encoded PNG images using the VLM model API.
-
-     This function performs the following steps:
-     1. Scales each image to meet encoding size requirements using `scale_image_to_encoding_size`.
-     2. Constructs the input payload containing the scaled images and the provided prompt.
-     3. Creates an inference client using the VLMModelInterface.
-     4. Calls the client's infer method to obtain a list of captions corresponding to the images.
-
-     Parameters
-     ----------
-     base64_images : List[str]
-         List of base64-encoded PNG image strings.
-     prompt : str
-         Text prompt to guide caption generation.
-     api_key : str
-         API key for authenticating with the VLM endpoint.
-     endpoint_url : str
-         URL of the VLM model HTTP endpoint.
-     model_name : str
-         The name of the model to use for inference.
-
-     Returns
-     -------
-     List[str]
-         A list of generated captions, each corresponding to an input image.
-
-     Raises
-     ------
-     Exception
-         Propagates any exception encountered during caption generation, with added context.
-     """
-     try:
-         # Scale each image to ensure it meets encoding size requirements.
-         scaled_images: List[str] = [scale_image_to_encoding_size(b64)[0] for b64 in base64_images]
-
-         # Build the input payload for the VLM model.
-         data: Dict[str, Any] = {
-             "base64_images": scaled_images,
-             "prompt": prompt,
-         }
-
-         # Create the inference client using the VLMModelInterface.
-         nim_client = create_inference_client(
-             model_interface=VLMModelInterface(),
-             endpoints=(None, endpoint_url),
-             auth_token=api_key,
-             infer_protocol="http",
-         )
-
-         logger.debug(f"Calling VLM endpoint: {endpoint_url} with model: {model_name}")
-         # Perform inference to generate captions.
-         captions: List[str] = nim_client.infer(data, model_name=model_name)
-         return captions
-
-     except Exception as e:
-         err_msg = f"_generate_captions: Error generating captions: {e}"
-         logger.error(err_msg, exc_info=True)
-         raise type(e)(err_msg) from e
-
-
- @unified_exception_handler
- def transform_image_create_vlm_caption_internal(
-     df_transform_ledger: pd.DataFrame,
-     task_config: Union[BaseModel, Dict[str, Any]],
-     transform_config: Any,
-     execution_trace_log: Optional[Dict[str, Any]] = None,
- ) -> pd.DataFrame:
-     """
-     Extracts and adds captions for image content in a DataFrame using the VLM model API.
-
-     This function updates the 'metadata' column for rows where the content type is "image".
-     It uses configuration values from task_config (or falls back to transform_config defaults)
-     to determine the API key, prompt, endpoint URL, and model name for caption generation.
-     The generated captions are added under the 'image_metadata.caption' key in the metadata.
-
-     Parameters
-     ----------
-     df_transform_ledger : pd.DataFrame
-         The input DataFrame containing image data. Each row must have a 'metadata' column
-         with at least the 'content' and 'content_metadata' keys.
-     task_config : Union[BaseModel, Dict[str, Any]]
-         Configuration parameters for caption extraction. If provided as a Pydantic model,
-         it will be converted to a dictionary. Expected keys include "api_key", "prompt",
-         "endpoint_url", and "model_name".
-     transform_config : Any
-         A configuration object providing default values for caption extraction. It should have
-         attributes: api_key, prompt, endpoint_url, and model_name.
-     execution_trace_log : Optional[Dict[str, Any]], default=None
-         Optional trace information for debugging or logging purposes.
-
-     Returns
-     -------
-     pd.DataFrame
-         The updated DataFrame with generated captions added to the 'image_metadata.caption' field
-         within the 'metadata' column for each image row.
-
-     Raises
-     ------
-     Exception
-         Propagates any exception encountered during the caption extraction process, with added context.
-     """
-
-     _ = execution_trace_log  # Unused variable; placeholder to prevent linter warnings.
-
-     logger.debug("Attempting to caption image content")
-
-     # Convert task_config to dictionary if it is a Pydantic model.
-     if isinstance(task_config, BaseModel):
-         task_config = task_config.model_dump()
-
-     # Retrieve configuration values with fallback to transform_config defaults.
-     api_key: str = task_config.get("api_key") or transform_config.api_key
-     prompt: str = task_config.get("prompt") or transform_config.prompt
-     endpoint_url: str = task_config.get("endpoint_url") or transform_config.endpoint_url
-     model_name: str = task_config.get("model_name") or transform_config.model_name
-
-     # Create a mask for rows where the content type is "image".
-     df_mask: pd.Series = df_transform_ledger["metadata"].apply(
-         lambda meta: meta.get("content_metadata", {}).get("type") == "image"
-     )
-
-     # If no image rows exist, return the original DataFrame.
-     if not df_mask.any():
-         return df_transform_ledger
-
-     # Collect base64-encoded images from the rows where the content type is "image".
-     base64_images: List[str] = df_transform_ledger.loc[df_mask, "metadata"].apply(lambda meta: meta["content"]).tolist()
-
-     # Generate captions for the collected images.
-     captions: List[str] = _generate_captions(base64_images, prompt, api_key, endpoint_url, model_name)
-
-     # Update the DataFrame: assign each generated caption to the corresponding row.
-     for idx, caption in zip(df_transform_ledger.loc[df_mask].index, captions):
-         meta: Dict[str, Any] = df_transform_ledger.at[idx, "metadata"]
-         image_meta: Dict[str, Any] = meta.get("image_metadata", {})
-         image_meta["caption"] = caption
-         meta["image_metadata"] = image_meta
-         df_transform_ledger.at[idx, "metadata"] = meta
-
-     logger.debug("Image content captioning complete")
-     result, execution_trace_log = df_transform_ledger, {}
-     _ = execution_trace_log  # Unused variable; placeholder to prevent linter warnings.
-
-     return result
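
For orientation, here is a minimal sketch of how the deleted captioning entry point was invoked. It is reconstructed only from the signature and docstring above: the ledger layout (`content_metadata.type == "image"`, base64 content, captions written to `image_metadata.caption`) and the config attributes (`api_key`, `prompt`, `endpoint_url`, `model_name`) come from the code; the endpoint URL, model id, and image payload are placeholders:

    import pandas as pd

    class CaptionConfig:  # hypothetical stand-in for the transform_config object
        api_key = "nvapi-..."                                        # placeholder credential
        prompt = "Describe this image."
        endpoint_url = "http://localhost:8000/v1/chat/completions"   # assumed VLM endpoint
        model_name = "example/vlm-model"                             # placeholder model id

    # One ledger row shaped the way the function expects.
    ledger = pd.DataFrame(
        [
            {
                "document_type": "image",
                "metadata": {
                    "content": "<base64-encoded PNG>",  # placeholder payload
                    "content_metadata": {"type": "image"},
                    "image_metadata": {},
                },
            }
        ]
    )

    captioned = transform_image_create_vlm_caption_internal(
        ledger, task_config={}, transform_config=CaptionConfig()
    )
    print(captioned.loc[0, "metadata"]["image_metadata"].get("caption"))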
nv_ingest_api/internal/transform/embed_text.py
@@ -1,496 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
- # All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- import logging
- from concurrent.futures import ThreadPoolExecutor
- from typing import Any, Dict, Tuple, Optional, Iterable, List
-
- import pandas as pd
- from openai import OpenAI
-
- from nv_ingest_api.internal.enums.common import ContentTypeEnum, StatusEnum, TaskTypeEnum
- from nv_ingest_api.internal.schemas.meta.metadata_schema import (
-     InfoMessageMetadataSchema,
- )
- from nv_ingest_api.internal.schemas.transform.transform_text_embedding_schema import TextEmbeddingSchema
- from nv_ingest_api.util.schema.schema_validator import validate_schema
-
- logger = logging.getLogger(__name__)
-
-
- # ------------------------------------------------------------------------------
- # Asynchronous Embedding Requests
- # ------------------------------------------------------------------------------
-
-
- def _make_async_request(
-     prompts: List[str],
-     api_key: str,
-     embedding_nim_endpoint: str,
-     embedding_model: str,
-     encoding_format: str,
-     input_type: str,
-     truncate: str,
-     filter_errors: bool,
- ) -> list:
-     """
-     Interacts directly with the NIM embedding service to calculate embeddings for a batch of prompts.
-
-     Parameters
-     ----------
-     prompts : List[str]
-         A list of prompt strings for which embeddings are to be calculated.
-     api_key : str
-         API key for authentication with the embedding service.
-     embedding_nim_endpoint : str
-         Base URL for the NIM embedding service.
-     embedding_model : str
-         The model to use for generating embeddings.
-     encoding_format : str
-         The desired encoding format.
-     input_type : str
-         The type of input data.
-     truncate : str
-         Truncation setting for the input data.
-     filter_errors : bool
-         Flag indicating whether to filter errors in the response.
-
-     Returns
-     -------
-     list
-         A dictionary with keys "embedding" (the embedding results) and "info_msg" (any error info).
-
-     Raises
-     ------
-     RuntimeError
-         If an error occurs during the embedding request, with an info message attached.
-     """
-     response = {}
-
-     try:
-         client = OpenAI(
-             api_key=api_key,
-             base_url=embedding_nim_endpoint,
-         )
-
-         resp = client.embeddings.create(
-             input=prompts,
-             model=embedding_model,
-             encoding_format=encoding_format,
-             extra_body={"input_type": input_type, "truncate": truncate},
-         )
-
-         response["embedding"] = resp.data
-         response["info_msg"] = None
-
-     except Exception as err:
-         info_msg = {
-             "task": TaskTypeEnum.EMBED.value,
-             "status": StatusEnum.ERROR.value,
-             "message": f"Embedding error: {err}",
-             "filter": filter_errors,
-         }
-         validated_info_msg = validate_schema(info_msg, InfoMessageMetadataSchema).model_dump()
-
-         response["embedding"] = [None] * len(prompts)
-         response["info_msg"] = validated_info_msg
-
-         raise RuntimeError(f"Embedding error occurred. Info message: {validated_info_msg}") from err
-
-     return response
-
-
- def _async_request_handler(
-     prompts: List[str],
-     api_key: str,
-     embedding_nim_endpoint: str,
-     embedding_model: str,
-     encoding_format: str,
-     input_type: str,
-     truncate: str,
-     filter_errors: bool,
- ) -> List[dict]:
-     """
-     Gathers calculated embedding results from the NIM embedding service concurrently.
-
-     Parameters
-     ----------
-     prompts : List[str]
-         A list of prompt batches.
-     api_key : str
-         API key for authentication.
-     embedding_nim_endpoint : str
-         Base URL for the NIM embedding service.
-     embedding_model : str
-         The model to use for generating embeddings.
-     encoding_format : str
-         The desired encoding format.
-     input_type : str
-         The type of input data.
-     truncate : str
-         Truncation setting for the input data.
-     filter_errors : bool
-         Flag indicating whether to filter errors in the response.
-
-     Returns
-     -------
-     List[dict]
-         A list of response dictionaries from the embedding service.
-     """
-     with ThreadPoolExecutor() as executor:
-         futures = [
-             executor.submit(
-                 _make_async_request,
-                 prompts=prompt_batch,
-                 api_key=api_key,
-                 embedding_nim_endpoint=embedding_nim_endpoint,
-                 embedding_model=embedding_model,
-                 encoding_format=encoding_format,
-                 input_type=input_type,
-                 truncate=truncate,
-                 filter_errors=filter_errors,
-             )
-             for prompt_batch in prompts
-         ]
-         results = [future.result() for future in futures]
-
-     return results
-
-
- def _async_runner(
-     prompts: List[str],
-     api_key: str,
-     embedding_nim_endpoint: str,
-     embedding_model: str,
-     encoding_format: str,
-     input_type: str,
-     truncate: str,
-     filter_errors: bool,
- ) -> dict:
-     """
-     Concurrently launches all NIM embedding requests and flattens the results.
-
-     Parameters
-     ----------
-     prompts : List[str]
-         A list of prompt batches.
-     api_key : str
-         API key for authentication.
-     embedding_nim_endpoint : str
-         Base URL for the NIM embedding service.
-     embedding_model : str
-         The model to use for generating embeddings.
-     encoding_format : str
-         The desired encoding format.
-     input_type : str
-         The type of input data.
-     truncate : str
-         Truncation setting for the input data.
-     filter_errors : bool
-         Flag indicating whether to filter errors in the response.
-
-     Returns
-     -------
-     dict
-         A dictionary with keys "embeddings" (flattened embedding results) and "info_msgs" (error messages).
-     """
-     results = _async_request_handler(
-         prompts,
-         api_key,
-         embedding_nim_endpoint,
-         embedding_model,
-         encoding_format,
-         input_type,
-         truncate,
-         filter_errors,
-     )
-
-     flat_results = {"embeddings": [], "info_msgs": []}
-     for batch_dict in results:
-         info_msg = batch_dict["info_msg"]
-         for embedding in batch_dict["embedding"]:
-             if not isinstance(embedding, list):
-                 if embedding is not None:
-                     flat_results["embeddings"].append(embedding.embedding)
-                 else:
-                     flat_results["embeddings"].append(embedding)
-             else:
-                 flat_results["embeddings"].append(embedding)
-             flat_results["info_msgs"].append(info_msg)
-
-     return flat_results
-
-
- # ------------------------------------------------------------------------------
- # Pandas UDFs for Content Extraction
- # ------------------------------------------------------------------------------
-
-
- def _add_embeddings(row, embeddings, info_msgs):
-     """
-     Updates a DataFrame row with embedding data and associated error info.
-
-     Parameters
-     ----------
-     row : pandas.Series
-         A row of the DataFrame.
-     embeddings : list
-         List of embeddings corresponding to DataFrame rows.
-     info_msgs : list
-         List of info message dictionaries corresponding to DataFrame rows.
-
-     Returns
-     -------
-     pandas.Series
-         The updated row with embedding and info message metadata added.
-     """
-     row["metadata"]["embedding"] = embeddings[row.name]
-     if info_msgs[row.name] is not None:
-         row["metadata"]["info_message_metadata"] = info_msgs[row.name]
-         row["document_type"] = ContentTypeEnum.INFO_MSG
-         row["_contains_embeddings"] = False
-     else:
-         row["_contains_embeddings"] = True
-
-     return row
-
-
- def _get_pandas_text_content(row):
-     """
-     Extracts text content from a DataFrame row.
-
-     Parameters
-     ----------
-     row : pandas.Series
-         A row containing the 'content' key.
-
-     Returns
-     -------
-     str
-         The text content from the row.
-     """
-     return row["content"]
-
-
- def _get_pandas_table_content(row):
-     """
-     Extracts table/chart content from a DataFrame row.
-
-     Parameters
-     ----------
-     row : pandas.Series
-         A row containing 'table_metadata' with 'table_content'.
-
-     Returns
-     -------
-     str
-         The table/chart content from the row.
-     """
-     return row["table_metadata"]["table_content"]
-
-
- def _get_pandas_image_content(row):
-     """
-     Extracts image caption content from a DataFrame row.
-
-     Parameters
-     ----------
-     row : pandas.Series
-         A row containing 'image_metadata' with 'caption'.
-
-     Returns
-     -------
-     str
-         The image caption from the row.
-     """
-     return row["image_metadata"]["caption"]
-
-
- # ------------------------------------------------------------------------------
- # Batch Processing Utilities
- # ------------------------------------------------------------------------------
-
-
- def _batch_generator(iterable: Iterable, batch_size: int = 10):
-     """
-     Yields batches of a specified size from an iterable.
-
-     Parameters
-     ----------
-     iterable : Iterable
-         The iterable to batch.
-     batch_size : int, optional
-         The size of each batch (default is 10).
-
-     Yields
-     ------
-     list
-         A batch of items from the iterable.
-     """
-     iter_len = len(iterable)
-     for idx in range(0, iter_len, batch_size):
-         yield iterable[idx : min(idx + batch_size, iter_len)]
-
-
- def _generate_batches(prompts: List[str], batch_size: int = 100) -> List[str]:
-     """
-     Splits a list of prompts into batches.
-
-     Parameters
-     ----------
-     prompts : List[str]
-         The list of prompt strings.
-     batch_size : int, optional
-         The desired batch size (default is 100).
-
-     Returns
-     -------
-     List[List[str]]
-         A list of batches, each containing a subset of the prompts.
-     """
-     return [batch for batch in _batch_generator(prompts, batch_size)]
-
-
- def _get_pandas_audio_content(row):
-     """
-     A pandas UDF used to select extracted audio transcription to be used to create embeddings.
-     """
-     return row["audio_metadata"]["audio_transcript"]
-
-
- # ------------------------------------------------------------------------------
- # DataFrame Concatenation Utility
- # ------------------------------------------------------------------------------
-
-
- def _concatenate_extractions_pandas(
-     base_df: pd.DataFrame, dataframes: List[pd.DataFrame], masks: List[pd.Series]
- ) -> pd.DataFrame:
-     """
-     Concatenates processed DataFrame rows (with embeddings) with unprocessed rows from the base DataFrame.
-
-     Parameters
-     ----------
-     base_df : pd.DataFrame
-         The original DataFrame.
-     dataframes : List[pd.DataFrame]
-         List of DataFrames that have been enriched with embeddings.
-     masks : List[pd.Series]
-         List of boolean masks indicating the rows that were processed.
-
-     Returns
-     -------
-     pd.DataFrame
-         The concatenated DataFrame with embeddings applied where available.
-     """
-     unified_mask = pd.Series(False, index=base_df.index)
-     for mask in masks:
-         unified_mask = unified_mask | mask
-
-     df_no_text = base_df.loc[~unified_mask].copy()
-     df_no_text["_contains_embeddings"] = False
-
-     dataframes.append(df_no_text)
-     combined_df = pd.concat(dataframes, axis=0, ignore_index=True).reset_index(drop=True)
-     return combined_df
-
-
- # ------------------------------------------------------------------------------
- # Embedding Extraction Pipeline
- # ------------------------------------------------------------------------------
-
-
- def transform_create_text_embeddings_internal(
-     df_transform_ledger: pd.DataFrame,
-     task_config: Dict[str, Any],
-     transform_config: TextEmbeddingSchema = TextEmbeddingSchema(),
-     execution_trace_log: Optional[Dict] = None,
- ) -> Tuple[pd.DataFrame, Dict]:
-     """
-     Generates text embeddings for supported content types (TEXT, STRUCTURED, IMAGE)
-     from a pandas DataFrame using asynchronous requests.
-
-     Parameters
-     ----------
-     df_transform_ledger : pd.DataFrame
-         The DataFrame containing content for embedding extraction.
-     task_config : Dict[str, Any]
-         Dictionary containing task properties (e.g., filter error flag).
-     transform_config : Any
-         Validated configuration for text embedding extraction (EmbedExtractionsSchema).
-     execution_trace_log : Optional[Dict], optional
-         Optional trace information for debugging or logging (default is None).
-
-     Returns
-     -------
-     Tuple[pd.DataFrame, Dict]
-         A tuple containing:
-         - The updated DataFrame with embeddings applied.
-         - A dictionary with trace information.
-     """
-     _ = task_config  # Currently unused.
-
-     if execution_trace_log is None:
-         execution_trace_log = {}
-         logger.debug("No trace_info provided. Initialized empty trace_info dictionary.")
-
-     # TODO(Devin)
-     if df_transform_ledger.empty:
-         return df_transform_ledger, {"trace_info": execution_trace_log}
-
-     embedding_dataframes = []
-     content_masks = []  # List of pandas boolean Series
-
-     # Define pandas content extractors for supported content types.
-     pandas_content_extractor = {
-         ContentTypeEnum.TEXT: _get_pandas_text_content,
-         ContentTypeEnum.STRUCTURED: _get_pandas_table_content,
-         ContentTypeEnum.IMAGE: _get_pandas_image_content,
-         ContentTypeEnum.AUDIO: _get_pandas_audio_content,
-         ContentTypeEnum.VIDEO: lambda x: None,  # Not supported yet.
-     }
-
-     logger.debug("Generating text embeddings for supported content types: TEXT, STRUCTURED, IMAGE.")
-
-     # Process each supported content type.
-     for content_type, content_getter in pandas_content_extractor.items():
-         if not content_getter:
-             logger.debug(f"Skipping unsupported content type: {content_type}")
-             continue
-
-         content_mask = df_transform_ledger["document_type"] == content_type.value
-         if not content_mask.any():
-             continue
-
-         # Extract content from metadata and filter out rows with empty content.
-         extracted_content = df_transform_ledger.loc[content_mask, "metadata"].apply(content_getter)
-         non_empty_mask = extracted_content.notna() & (extracted_content.str.strip() != "")
-         final_mask = content_mask & non_empty_mask
-         if not final_mask.any():
-             continue
-
-         df_content = df_transform_ledger.loc[final_mask].copy().reset_index(drop=True)
-         filtered_content = df_content["metadata"].apply(content_getter)
-         filtered_content_batches = _generate_batches(filtered_content.tolist(), batch_size=transform_config.batch_size)
-         content_embeddings = _async_runner(
-             filtered_content_batches,
-             transform_config.api_key,
-             transform_config.embedding_nim_endpoint,
-             transform_config.embedding_model,
-             transform_config.encoding_format,
-             transform_config.input_type,
-             transform_config.truncate,
-             False,
-         )
-         # Apply the embeddings (and any error info) to each row.
-         df_content[["metadata", "document_type", "_contains_embeddings"]] = df_content.apply(
-             _add_embeddings, **content_embeddings, axis=1
-         )[["metadata", "document_type", "_contains_embeddings"]]
-         df_content["_content"] = filtered_content
-
-         embedding_dataframes.append(df_content)
-         content_masks.append(final_mask)
-
-     combined_df = _concatenate_extractions_pandas(df_transform_ledger, embedding_dataframes, content_masks)
-     return combined_df, {"trace_info": execution_trace_log}
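
The embedding entry point can be exercised the same way. Below is a hedged sketch for the TEXT path, assuming `TextEmbeddingSchema` accepts the field names the code reads (`api_key`, `embedding_nim_endpoint`, `embedding_model`, with defaults for `encoding_format`, `input_type`, `truncate`, and `batch_size`); the endpoint and model id are placeholders:

    import pandas as pd

    config = TextEmbeddingSchema(
        api_key="nvapi-...",                                # placeholder credential
        embedding_nim_endpoint="http://localhost:8000/v1",  # assumed NIM base URL
        embedding_model="nvidia/nv-embedqa-e5-v5",          # example model id
    )

    # One TEXT row: _get_pandas_text_content reads metadata["content"].
    ledger = pd.DataFrame(
        [
            {
                "document_type": ContentTypeEnum.TEXT.value,
                "metadata": {"content": "NV-Ingest turns documents into retrievable chunks."},
            }
        ]
    )

    combined, trace = transform_create_text_embeddings_internal(
        ledger, task_config={}, transform_config=config
    )
    print(combined.loc[0, "metadata"].get("embedding"))  # embedding vector, or None for error rows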