nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177)
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
@@ -0,0 +1,219 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ from typing import Any, Dict, List, Optional, Tuple, Union
7
+
8
+ import pandas as pd
9
+ from pydantic import BaseModel
10
+
11
+ from nv_ingest_api.internal.primitives.nim.model_interface.vlm import VLMModelInterface
12
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
13
+ from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
14
+ from nv_ingest_api.util.image_processing import scale_image_to_encoding_size
15
+ from nv_ingest_api.util.nim import create_inference_client
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def _prepare_dataframes_mod(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series]:
    """
    Split out the rows of a DataFrame whose "document_type" is IMAGE.

    The function performs the following:
      1. If the DataFrame is empty or has no "document_type" column, returns the
         original DataFrame, an empty DataFrame, and an empty boolean Series.
      2. Otherwise, builds a boolean Series marking rows where "document_type"
         equals ``ContentTypeEnum.IMAGE``.
      3. Selects the rows flagged by that Series.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame that should contain a "document_type" column.

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame, pd.Series]
        A tuple containing:
          - The original DataFrame (the same object that was passed in).
          - A DataFrame filtered to rows where "document_type" is IMAGE.
          - A boolean Series indicating which rows in the original DataFrame are IMAGE rows.

    Raises
    ------
    Exception
        Any failure is logged and re-raised. When the exception class accepts a
        single message argument, a new instance of the same class carrying added
        context is raised (chained to the original); otherwise the original
        exception is re-raised unchanged.
    """
    try:
        if df.empty or "document_type" not in df.columns:
            return df, pd.DataFrame(), pd.Series(dtype=bool)

        bool_index: pd.Series = df["document_type"] == ContentTypeEnum.IMAGE
        df_matched: pd.DataFrame = df.loc[bool_index]

        return df, df_matched, bool_index

    except Exception as e:
        err_msg = f"_prepare_dataframes_mod: Error preparing dataframes. Original error: {e}"
        logger.error(err_msg, exc_info=True)

        # Not every exception class can be rebuilt from a lone message
        # (e.g. UnicodeDecodeError needs five arguments). Attempting it blindly
        # would replace the real error with a TypeError from the constructor.
        try:
            wrapped = type(e)(err_msg)
        except Exception:
            wrapped = None

        if wrapped is None:
            # Keep the original type and traceback; the context is already logged.
            raise
        raise wrapped from e
56
+
57
+
58
def _generate_captions(
    base64_images: List[str],
    prompt: str,
    system_prompt: Optional[str],
    api_key: str,
    endpoint_url: str,
    model_name: str,
) -> List[str]:
    """
    Generates captions for a list of base64-encoded images using the VLM model API.

    This function performs the following steps:
      1. Returns an empty list immediately when there are no images to caption.
      2. Scales each image using `scale_image_to_encoding_size` (the first element
         of its return value is used as the scaled image).
      3. Constructs the input payload containing the scaled images, the prompt and,
         when provided, the system prompt.
      4. Creates an inference client using the VLMModelInterface over HTTP.
      5. Calls the client's infer method to obtain the captions.

    Parameters
    ----------
    base64_images : List[str]
        List of base64-encoded image strings.
    prompt : str
        Text prompt to guide caption generation.
    system_prompt : Optional[str]
        Optional system prompt. It is only added to the payload when truthy.
    api_key : str
        API key passed to the inference client as its auth token.
    endpoint_url : str
        URL of the VLM model HTTP endpoint.
    model_name : str
        The name of the model to use for inference.

    Returns
    -------
    List[str]
        The captions returned by the inference client; expected to correspond
        one-to-one with the input images. Empty when no images were supplied.

    Raises
    ------
    Exception
        Any failure is logged and re-raised. When the exception class accepts a
        single message argument, a new instance of the same class carrying added
        context is raised (chained to the original); otherwise the original
        exception is re-raised unchanged.
    """
    # Nothing to caption: avoid building a client and issuing a pointless request.
    if not base64_images:
        return []

    try:
        # Scale each image to ensure it meets encoding size requirements.
        scaled_images: List[str] = [scale_image_to_encoding_size(b64)[0] for b64 in base64_images]

        # Build the input payload for the VLM model.
        data: Dict[str, Any] = {
            "base64_images": scaled_images,
            "prompt": prompt,
        }
        if system_prompt:
            data["system_prompt"] = system_prompt

        # Create the inference client using the VLMModelInterface.
        # NOTE(review): the endpoints tuple presumably is (grpc, http) - confirm.
        nim_client = create_inference_client(
            model_interface=VLMModelInterface(),
            endpoints=(None, endpoint_url),
            auth_token=api_key,
            infer_protocol="http",
        )

        # Perform inference to generate captions.
        captions: List[str] = nim_client.infer(data, model_name=model_name)
        return captions

    except Exception as e:
        err_msg = f"_generate_captions: Error generating captions: {e}"
        logger.error(err_msg, exc_info=True)

        # Exception classes such as json.JSONDecodeError or UnicodeDecodeError
        # cannot be rebuilt from a lone message; trying would raise a TypeError
        # from the constructor and hide the real failure.
        try:
            wrapped = type(e)(err_msg)
        except Exception:
            wrapped = None

        if wrapped is None:
            # Keep the original type and traceback; the context is already logged.
            raise
        raise wrapped from e
126
+
127
+
128
@unified_exception_handler
def transform_image_create_vlm_caption_internal(
    df_transform_ledger: pd.DataFrame,
    task_config: Union[BaseModel, Dict[str, Any]],
    transform_config: Any,
    execution_trace_log: Optional[Dict[str, Any]] = None,
) -> pd.DataFrame:
    """
    Extracts and adds captions for image content in a DataFrame using the VLM model API.

    This function updates the 'metadata' column for rows whose
    ``metadata["content_metadata"]["type"]`` equals "image". Each setting is taken
    from task_config when it holds a truthy value, otherwise from the matching
    attribute of transform_config. The generated captions are stored under
    ``metadata["image_metadata"]["caption"]``.

    Parameters
    ----------
    df_transform_ledger : pd.DataFrame
        The input DataFrame. It must have a 'metadata' column of dict-like values;
        image rows must provide the 'content' key (the base64 image) and a
        'content_metadata' entry whose 'type' is "image".
    task_config : Union[BaseModel, Dict[str, Any]]
        Configuration parameters for caption extraction. A Pydantic model is
        converted to a dictionary; ``None`` is treated as an empty dictionary.
        Recognized keys are "api_key", "prompt", "system_prompt", "endpoint_url",
        and "model_name".
    transform_config : Any
        A configuration object providing default values. It must have the
        attributes api_key, prompt, endpoint_url, and model_name; system_prompt
        is read when present and defaults to None otherwise.
    execution_trace_log : Optional[Dict[str, Any]], default=None
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    pd.DataFrame
        The same DataFrame object, with captions added to the metadata of each
        image row. When there are no image rows it is returned untouched.

    Raises
    ------
    Exception
        Exceptions raised during caption extraction propagate to the
        `unified_exception_handler` decorator.
    """

    _ = execution_trace_log  # Unused; kept so the signature matches sibling transforms.

    logger.debug("Attempting to caption image content")

    # Normalize task_config to a plain dictionary.
    if isinstance(task_config, BaseModel):
        task_config = task_config.model_dump()
    elif task_config is None:
        task_config = {}

    # Retrieve configuration values with fallback to transform_config defaults.
    api_key: str = task_config.get("api_key") or transform_config.api_key
    prompt: str = task_config.get("prompt") or transform_config.prompt
    # The system prompt is optional, so tolerate configs that do not define it.
    system_prompt: Optional[str] = task_config.get("system_prompt") or getattr(
        transform_config, "system_prompt", None
    )
    endpoint_url: str = task_config.get("endpoint_url") or transform_config.endpoint_url
    model_name: str = task_config.get("model_name") or transform_config.model_name

    # Create a mask for rows where the content type is "image".
    # `or {}` also covers a "content_metadata" key that is present but None.
    df_mask: pd.Series = df_transform_ledger["metadata"].apply(
        lambda meta: (meta.get("content_metadata") or {}).get("type") == "image"
    )

    # If no image rows exist, return the original DataFrame.
    if not df_mask.any():
        return df_transform_ledger

    # Address image rows by position rather than by label: label-based `.at`
    # returns a Series instead of a single value when index labels repeat.
    metadata_col: int = df_transform_ledger.columns.get_loc("metadata")
    image_positions: List[int] = [pos for pos, is_image in enumerate(df_mask.tolist()) if is_image]

    # Collect base64-encoded images from the image rows, in row order.
    base64_images: List[str] = [df_transform_ledger.iat[pos, metadata_col]["content"] for pos in image_positions]

    # Generate captions for the collected images.
    captions: List[str] = _generate_captions(
        base64_images,
        prompt,
        system_prompt,
        api_key,
        endpoint_url,
        model_name,
    )

    # zip() below stops at the shorter sequence; surface a mismatch instead of
    # silently leaving some rows without a caption.
    if len(captions) != len(image_positions):
        logger.warning(
            "Caption count (%d) does not match image row count (%d); extra entries are ignored.",
            len(captions),
            len(image_positions),
        )

    # Update the DataFrame: assign each generated caption to the corresponding row.
    for pos, caption in zip(image_positions, captions):
        meta: Dict[str, Any] = df_transform_ledger.iat[pos, metadata_col]
        # `or {}` handles an "image_metadata" key that exists with a None value.
        image_meta: Dict[str, Any] = meta.get("image_metadata") or {}
        image_meta["caption"] = caption
        meta["image_metadata"] = image_meta
        df_transform_ledger.iat[pos, metadata_col] = meta

    logger.debug("Image content captioning complete")

    return df_transform_ledger