nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nv-ingest-api might be problematic.

Files changed (177)
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
nv_ingest_api/internal/extract/pptx/pptx_extractor.py
@@ -0,0 +1,210 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import base64
+ import functools
+ import io
+ import logging
+ from typing import Any, Optional, Dict, Union, Tuple
+
+ import pandas as pd
+ from pydantic import BaseModel
+
+ from nv_ingest_api.internal.extract.pdf.engines.pdfium import pdfium_extractor
+ from nv_ingest_api.internal.extract.pptx.engines.pptx_helper import convert_stream_with_libreoffice
+ from nv_ingest_api.internal.extract.pptx.engines.pptx_helper import python_pptx
+ from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
+
+ logger = logging.getLogger(__name__)
+
+
+ def _prepare_task_properties(
+     base64_row: pd.Series, task_props: Union[Dict[str, Any], BaseModel]
+ ) -> Tuple[Dict[str, Any], Optional[str]]:
+     """
+     Prepare and return the task properties dictionary and source identifier from a DataFrame row.
+
+     This function converts task properties to a dictionary (if provided as a Pydantic model),
+     extracts row data (excluding the "content" field), and stores it under the "row_data" key within
+     the task properties. It also retrieves the "source_id" from the row if present.
+
+     Parameters
+     ----------
+     base64_row : pd.Series
+         A pandas Series representing a row containing base64-encoded content under the key "content"
+         and optionally a "source_id".
+     task_props : Union[Dict[str, Any], BaseModel]
+         A dictionary or Pydantic model containing extraction instructions and parameters.
+
+     Returns
+     -------
+     Tuple[Dict[str, Any], Optional[str]]
+         A tuple where the first element is the prepared task properties dictionary (with "row_data" added)
+         and the second element is the source_id if present; otherwise, None.
+     """
+     # If task_props is a Pydantic model, convert it to a dictionary.
+     if isinstance(task_props, BaseModel):
+         task_props = task_props.model_dump()
+     else:
+         task_props = dict(task_props)
+
+     # Exclude the "content" field from the row data.
+     row_data = base64_row.drop(labels=["content"], errors="ignore")
+     if "params" not in task_props:
+         task_props["params"] = {}
+     # Store the row data in the parameters.
+     task_props["params"]["row_data"] = row_data
+
+     # Retrieve the source identifier if available.
+     source_id = base64_row.get("source_id", None)
+     return task_props, source_id
+
+
+ @unified_exception_handler
+ def _decode_and_extract_from_pptx(
+     base64_row: pd.Series,
+     task_props: Union[Dict[str, Any], BaseModel],
+     extraction_config: Any,
+     trace_info: Dict[str, Any],
+ ) -> Any:
+     """
+     Decode base64-encoded PPTX content from a DataFrame row and extract data using the specified method.
+
+     The function prepares task properties (using `_prepare_task_properties`), decodes the base64 content
+     into a byte stream, determines extraction parameters, and calls the extraction function (e.g., `python_pptx`)
+     with the proper flags. If extraction fails, an exception tag is returned.
+
+     Parameters
+     ----------
+     base64_row : pd.Series
+         A Series containing base64-encoded PPTX content under the key "content" and optionally a "source_id".
+     task_props : Union[Dict[str, Any], BaseModel]
+         A dictionary or Pydantic model containing extraction instructions (may include a "method" key and "params").
+     extraction_config : Any
+         A configuration object containing PPTX extraction settings (e.g., `pptx_extraction_config`).
+     trace_info : Dict[str, Any]
+         A dictionary with trace information for logging or debugging.
+
+     Returns
+     -------
+     Any
+         The extracted data from the PPTX file, or an exception tag indicating failure.
+     """
+     # Prepare task properties and extract source_id.
+     prepared_task_props, source_id = _prepare_task_properties(base64_row, task_props)
+
+     # Decode base64 content into bytes and create a BytesIO stream.
+     base64_content: str = base64_row["content"]
+     pptx_bytes: bytes = base64.b64decode(base64_content)
+     pptx_stream: io.BytesIO = io.BytesIO(pptx_bytes)
+
+     # Retrieve extraction parameters (and remove boolean flags as they are consumed).
+     extract_method = prepared_task_props.get("method", "python_pptx")
+     extract_params: Dict[str, Any] = prepared_task_props.get("params", {})
+     extract_text: bool = extract_params.pop("extract_text", False)
+     extract_images: bool = extract_params.pop("extract_images", False)
+     extract_tables: bool = extract_params.pop("extract_tables", False)
+     extract_charts: bool = extract_params.pop("extract_charts", False)
+     extract_infographics: bool = extract_params.pop("extract_infographics", False)
+
+     # Inject additional configuration and trace information.
+     if getattr(extraction_config, "pptx_extraction_config", None) is not None:
+         extract_params["pptx_extraction_config"] = extraction_config.pptx_extraction_config
+     if trace_info is not None:
+         extract_params["trace_info"] = trace_info
+
+     if extract_method == "render_as_pdf":
+         pdf_stream = convert_stream_with_libreoffice(pptx_stream, "pptx", "pdf")
+
+         pdf_extract_method = extract_params.get("pdf_extract_method", "pdfium")
+         pdf_extractor_config = extract_params.copy()
+         pdf_extractor_config["extract_method"] = pdf_extract_method
+         if getattr(extraction_config, "pdfium_config", None) is not None:
+             pdf_extractor_config["pdfium_config"] = extraction_config.pdfium_config
+
+         extracted_data: Any = pdfium_extractor(
+             pdf_stream=pdf_stream,
+             extract_text=extract_text,
+             extract_images=extract_images,
+             extract_infographics=extract_infographics,
+             extract_tables=extract_tables,
+             extract_charts=extract_charts,
+             extract_page_as_image=False,
+             extractor_config=pdf_extractor_config,
+             execution_trace_log=None,
+         )
+     elif extract_method == "python_pptx":
+         # Call the PPTX extraction function.
+         extracted_data = python_pptx(
+             pptx_stream=pptx_stream,
+             extract_text=extract_text,
+             extract_images=extract_images,
+             extract_infographics=extract_infographics,
+             extract_tables=extract_tables,
+             extract_charts=extract_charts,
+             extraction_config=extract_params,
+             execution_trace_log=None,
+         )
+     else:
+         raise ValueError(f"Unsupported PPTX extraction method: {extract_method}")
+
+     return extracted_data
+
+
+ @unified_exception_handler
+ def extract_primitives_from_pptx_internal(
+     df_extraction_ledger: pd.DataFrame,
+     task_config: Union[Dict[str, Any], BaseModel],
+     extraction_config: Any,  # Assuming PPTXExtractorSchema or similar type
+     execution_trace_log: Optional[Dict[str, Any]] = None,
+ ) -> Tuple[pd.DataFrame, Dict[str, Any]]:
+     """
+     Process a DataFrame containing base64-encoded PPTX files and extract primitive data.
+
+     This function applies a decoding and extraction routine to each row of the DataFrame
+     (via `_decode_and_extract_from_pptx`), then explodes any list results into separate rows, drops missing values,
+     and compiles the extracted data into a new DataFrame. The resulting DataFrame includes columns for document type,
+     extracted metadata, and a unique identifier (UUID).
+
+     Parameters
+     ----------
+     df_extraction_ledger : pd.DataFrame
+         Input DataFrame with PPTX files in base64 encoding. Expected to include columns 'source_id' and 'content'.
+     task_config : Union[Dict[str, Any], BaseModel]
+         Configuration for the PPTX extraction task, as a dict or Pydantic model.
+     extraction_config : Any
+         Configuration object for PPTX extraction (e.g., PPTXExtractorSchema).
+     execution_trace_log : Optional[Dict[str, Any]], optional
+         Optional dictionary containing trace information for debugging.
+
+     Returns
+     -------
+     Tuple[pd.DataFrame, Dict[str, Any]]
+         A DataFrame with extracted PPTX content containing the columns
+         "document_type", "metadata", and "uuid", plus a (currently empty) trace dictionary.
+
+     Raises
+     ------
+     Exception
+         Reraises any exception encountered during extraction with additional context.
+     """
+     # Create a partial function to decode and extract content from each DataFrame row.
+     decode_and_extract_partial = functools.partial(
+         _decode_and_extract_from_pptx,
+         task_props=task_config,
+         extraction_config=extraction_config,
+         trace_info=execution_trace_log,
+     )
+     # Apply the decoding and extraction to each row.
+     extraction_series = df_extraction_ledger.apply(decode_and_extract_partial, axis=1)
+     # Explode list results into separate rows and remove missing values.
+     extraction_series = extraction_series.explode().dropna()
+
+     # Convert the series into a DataFrame with defined columns.
+     if not extraction_series.empty:
+         extracted_df = pd.DataFrame(extraction_series.to_list(), columns=["document_type", "metadata", "uuid"])
+     else:
+         extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})
+
+     return extracted_df, {}
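
For orientation, a minimal, hypothetical sketch of driving this entry point directly. The file path "deck.pptx" and the None extraction_config are assumptions for illustration; a None config is tolerated because the extractor probes attributes with getattr(..., None), and a real PPTXExtractorSchema would normally be passed instead.

import base64
import pandas as pd

from nv_ingest_api.internal.extract.pptx.pptx_extractor import extract_primitives_from_pptx_internal

# Build a one-row ledger: base64-encoded PPTX bytes plus a source identifier.
with open("deck.pptx", "rb") as f:
    ledger = pd.DataFrame(
        {
            "source_id": ["deck.pptx"],
            "content": [base64.b64encode(f.read()).decode("utf-8")],
        }
    )

task_config = {"method": "python_pptx", "params": {"extract_text": True, "extract_tables": True}}

extracted_df, _trace = extract_primitives_from_pptx_internal(ledger, task_config, extraction_config=None)
print(extracted_df[["document_type", "uuid"]])
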
nv_ingest_api/internal/meta/__init__.py
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
nv_ingest_api/internal/meta/udf.py
@@ -0,0 +1,232 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import hashlib
+ import inspect
+ import logging
+ import time
+ from typing import Any, Callable, Dict, List, Optional
+ from dataclasses import dataclass
+
+ from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_all_tasks_by_type
+ from nv_ingest_api.internal.schemas.meta.udf import UDFStageSchema
+ from nv_ingest_api.util.imports.callable_signatures import ingest_callable_signature
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class CachedUDF:
+     """Cached UDF function with metadata."""
+
+     function: Callable
+     function_name: str
+     signature_validated: bool
+     created_at: float
+     last_used: float
+     use_count: int
+
+
+ class UDFCache:
+     """LRU cache for compiled and validated UDF functions."""
+
+     def __init__(self, max_size: int = 128, ttl_seconds: Optional[int] = 3600):
+         self.max_size = max_size
+         self.ttl_seconds = ttl_seconds
+         self.cache: Dict[str, CachedUDF] = {}
+         self.access_order: List[str] = []  # For LRU tracking
+
+     def _generate_cache_key(self, udf_function_str: str, udf_function_name: str) -> str:
+         """Generate a cache key from the UDF source string and function name."""
+         content = f"{udf_function_str.strip()}:{udf_function_name}"
+         return hashlib.sha256(content.encode()).hexdigest()
+
+     def _evict_lru(self):
+         """Remove the least recently used item."""
+         if self.access_order:
+             lru_key = self.access_order.pop(0)
+             self.cache.pop(lru_key, None)
+
+     def _cleanup_expired(self):
+         """Remove expired entries if a TTL is configured."""
+         if not self.ttl_seconds:
+             return
+
+         current_time = time.time()
+         expired_keys = [
+             key for key, cached_udf in self.cache.items() if current_time - cached_udf.created_at > self.ttl_seconds
+         ]
+
+         for key in expired_keys:
+             self.cache.pop(key, None)
+             if key in self.access_order:
+                 self.access_order.remove(key)
+
+     def get(self, udf_function_str: str, udf_function_name: str) -> Optional[CachedUDF]:
+         """Get a cached UDF function if available."""
+         self._cleanup_expired()
+
+         cache_key = self._generate_cache_key(udf_function_str, udf_function_name)
+
+         if cache_key in self.cache:
+             # Update access tracking
+             if cache_key in self.access_order:
+                 self.access_order.remove(cache_key)
+             self.access_order.append(cache_key)
+
+             # Update usage stats
+             cached_udf = self.cache[cache_key]
+             cached_udf.last_used = time.time()
+             cached_udf.use_count += 1
+
+             return cached_udf
+
+         return None
+
+     def put(
+         self, udf_function_str: str, udf_function_name: str, function: Callable, signature_validated: bool = True
+     ) -> str:
+         """Cache a compiled and validated UDF function."""
+         cache_key = self._generate_cache_key(udf_function_str, udf_function_name)
+
+         # Evict LRU entries if at capacity
+         while len(self.cache) >= self.max_size:
+             self._evict_lru()
+
+         current_time = time.time()
+         cached_udf = CachedUDF(
+             function=function,
+             function_name=udf_function_name,
+             signature_validated=signature_validated,
+             created_at=current_time,
+             last_used=current_time,
+             use_count=1,
+         )
+
+         self.cache[cache_key] = cached_udf
+         self.access_order.append(cache_key)
+
+         return cache_key
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Get cache statistics."""
+         total_uses = sum(udf.use_count for udf in self.cache.values())
+         most_used = max(self.cache.values(), key=lambda x: x.use_count, default=None)
+         return {
+             "size": len(self.cache),
+             "max_size": self.max_size,
+             "total_uses": total_uses,
+             "most_used_function": most_used.function_name if most_used else None,
+             "most_used_count": most_used.use_count if most_used else 0,
+         }
+
+
+ # Global cache instance
+ _udf_cache = UDFCache(max_size=128, ttl_seconds=3600)
+
+
+ def compile_and_validate_udf(udf_function_str: str, udf_function_name: str, task_num: int) -> Callable:
+     """Compile and validate a UDF function (extracted for caching)."""
+     # Execute the UDF function string in a controlled namespace
+     namespace: Dict[str, Any] = {}
+     try:
+         exec(udf_function_str, namespace)
+     except Exception as e:
+         raise ValueError(f"UDF task {task_num} failed to execute: {str(e)}")
+
+     # Extract the specified function from the namespace
+     if udf_function_name in namespace and callable(namespace[udf_function_name]):
+         udf_function = namespace[udf_function_name]
+     else:
+         raise ValueError(f"UDF task {task_num}: Specified UDF function '{udf_function_name}' not found or not callable")
+
+     # Validate the UDF function signature
+     try:
+         ingest_callable_signature(inspect.signature(udf_function))
+     except Exception as e:
+         raise ValueError(f"UDF task {task_num} has invalid function signature: {str(e)}")
+
+     return udf_function
+
+
+ def get_udf_cache_stats() -> Dict[str, Any]:
+     """Get UDF cache performance statistics."""
+     return _udf_cache.get_stats()
+
+
+ def udf_stage_callable_fn(control_message: IngestControlMessage, stage_config: UDFStageSchema) -> IngestControlMessage:
+     """
+     UDF stage callable function that processes UDF tasks in a control message.
+
+     This function extracts all UDF tasks from the control message and executes them sequentially.
+
+     Parameters
+     ----------
+     control_message : IngestControlMessage
+         The control message containing UDF tasks to process.
+     stage_config : UDFStageSchema
+         Configuration for the UDF stage.
+
+     Returns
+     -------
+     IngestControlMessage
+         The control message after processing all UDF tasks.
+     """
+     logger.debug("Starting UDF stage processing")
+
+     # Extract all UDF tasks from the control message using the free function
+     try:
+         all_task_configs = remove_all_tasks_by_type(control_message, "udf")
+     except ValueError:
+         # No UDF tasks found
+         if stage_config.ignore_empty_udf:
+             logger.debug("No UDF tasks found, ignoring as configured")
+             return control_message
+         else:
+             raise ValueError("No UDF tasks found in control message")
+
+     # Process each UDF task sequentially
+     for task_num, task_config in enumerate(all_task_configs, 1):
+         logger.debug(f"Processing UDF task {task_num} of {len(all_task_configs)}")
+
+         # Get the UDF function string and function name from the task properties
+         udf_function_str = task_config.get("udf_function", "").strip()
+         udf_function_name = task_config.get("udf_function_name", "").strip()
+
+         # Skip empty UDF functions if configured to ignore them
+         if not udf_function_str:
+             if stage_config.ignore_empty_udf:
+                 logger.debug(f"UDF task {task_num} has empty function, skipping as configured")
+                 continue
+             else:
+                 raise ValueError(f"UDF task {task_num} has empty function string")
+
+         # Validate that a function name is provided
+         if not udf_function_name:
+             raise ValueError(f"UDF task {task_num} missing required 'udf_function_name' property")
+
+         # Check whether the UDF function is cached
+         cached_udf = _udf_cache.get(udf_function_str, udf_function_name)
+         if cached_udf:
+             udf_function = cached_udf.function
+         else:
+             # Compile and validate the UDF function
+             udf_function = compile_and_validate_udf(udf_function_str, udf_function_name, task_num)
+             # Cache the compiled UDF function
+             _udf_cache.put(udf_function_str, udf_function_name, udf_function)
+
+         # Execute the UDF function with the control message
+         try:
+             control_message = udf_function(control_message)
+         except Exception as e:
+             raise ValueError(f"UDF task {task_num} execution failed: {str(e)}")
+
+         # Validate that the UDF function returned an IngestControlMessage
+         if not isinstance(control_message, IngestControlMessage):
+             raise ValueError(f"UDF task {task_num} must return an IngestControlMessage, got {type(control_message)}")
+
+         logger.debug(f"UDF task {task_num} completed successfully")
+
+     logger.debug(f"UDF stage processing completed. Processed {len(all_task_configs)} UDF tasks")
+     return control_message
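
To make the caching contract concrete, here is a small standalone sketch exercising UDFCache directly; the lambdas stand in for compiled UDFs, and the stage normally performs these calls internally via _udf_cache.

from nv_ingest_api.internal.meta.udf import UDFCache

cache = UDFCache(max_size=2, ttl_seconds=None)  # TTL disabled so nothing expires mid-demo

# Keys are sha256("<stripped source>:<function name>"), so the same source/name pair always hits.
cache.put("def a(msg): return msg", "a", lambda msg: msg)
assert cache.get("def a(msg): return msg", "a").use_count == 2  # put counts once, get bumps it

cache.put("def b(msg): return msg", "b", lambda msg: msg)
cache.put("def c(msg): return msg", "c", lambda msg: msg)  # at capacity 2: evicts "a", the LRU entry

print(cache.get_stats())                       # size 2, max_size 2
print(cache.get("def a(msg): return msg", "a"))  # None: "a" was evicted
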
nv_ingest_api/internal/mutate/__init__.py
@@ -0,0 +1,3 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
nv_ingest_api/internal/mutate/deduplicate.py
@@ -0,0 +1,110 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import logging
+ import hashlib
+ from typing import Any, Dict, Optional, List
+
+ import pandas as pd
+
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
+ from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import ImageDedupSchema
+
+ logger = logging.getLogger(__name__)
+
+
+ def _hash_content(x: Any, algorithm: str = "md5") -> bytes:
+     """
+     Compute a hash of the content using the specified algorithm.
+
+     Parameters
+     ----------
+     x : dict
+         A dictionary containing the content under the key "content".
+     algorithm : str, optional
+         Hashing algorithm to use (default "md5").
+
+     Returns
+     -------
+     bytes
+         The computed hash.
+     """
+     try:
+         return hashlib.new(algorithm, x["content"].encode()).digest()
+     except Exception as e:
+         msg = f"hash_content: Error computing hash: {e}"
+         logger.error(msg, exc_info=True)
+         raise type(e)(msg) from e
+
+
+ def deduplicate_images_internal(
+     df_ledger: pd.DataFrame,
+     task_config: Dict[str, Any],
+     mutate_config: ImageDedupSchema = ImageDedupSchema(),
+     execution_trace_log: Optional[List[Any]] = None,
+ ) -> pd.DataFrame:
+     """
+     Deduplicate images in a DataFrame based on content hashes.
+
+     The function processes rows where the 'document_type' is IMAGE, computes a content hash for each,
+     and removes duplicate rows, keeping the first occurrence of each hash. A 'hash_algorithm' key in
+     task_config selects the algorithm used for hashing.
+
+     Parameters
+     ----------
+     df_ledger : pd.DataFrame
+         DataFrame containing at least 'document_type' and 'metadata' columns.
+     task_config : dict
+         Configuration parameters, including:
+         - "hash_algorithm": str, the algorithm to use for hashing (default "md5").
+     mutate_config : ImageDedupSchema, optional
+     execution_trace_log : Optional[List[Any]], optional
+
+     Returns
+     -------
+     pd.DataFrame
+         The DataFrame with duplicate images removed.
+
+     Raises
+     ------
+     ValueError
+         If the required columns are missing.
+     Exception
+         For any other errors encountered during deduplication.
+     """
+
+     _ = mutate_config  # Unused variable
+     _ = execution_trace_log  # TODO(Devin): Implement trace logging
+
+     try:
+         # Verify required columns exist.
+         for col in ("document_type", "metadata"):
+             if col not in df_ledger.columns:
+                 raise ValueError(f"Missing required column '{col}'.")
+
+         # Select image rows.
+         image_mask = df_ledger["document_type"] == ContentTypeEnum.IMAGE
+         if not image_mask.any():
+             return df_ledger[~image_mask]
+
+         df_images = df_ledger.loc[image_mask].copy()
+         hash_algorithm = task_config.get("hash_algorithm", "md5")
+
+         # Compute a content hash for each image and keep only the first occurrence of each hash.
+         df_images["_image_content_hash"] = df_images["metadata"].apply(_hash_content, args=(hash_algorithm,))
+         df_images_deduped = df_images.drop_duplicates(subset="_image_content_hash")
+         deduped_indices = df_images_deduped.index
+
+         non_image_rows = df_ledger.loc[~image_mask]
+         deduped_images = df_images.loc[deduped_indices][df_ledger.columns.difference(["_image_content_hash"])]
+
+         # Recombine the deduplicated image rows with the untouched non-image rows.
+         result = pd.concat([deduped_images, non_image_rows], axis=0)
+
+         return result
+     except Exception as e:
+         msg = f"deduplicate_images_internal: Error applying deduplication filter: {e}"
+         logger.error(msg, exc_info=True)
+         raise type(e)(msg) from e
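
A minimal sketch of the dedup behavior on a toy ledger; the base64-looking payloads are placeholders, and the metadata shape follows _hash_content above, which reads metadata["content"].

import pandas as pd

from nv_ingest_api.internal.enums.common import ContentTypeEnum
from nv_ingest_api.internal.mutate.deduplicate import deduplicate_images_internal

ledger = pd.DataFrame(
    {
        "document_type": [ContentTypeEnum.IMAGE] * 3,
        "metadata": [
            {"content": "iVBORw0KGgoAAA=="},  # same payload twice -> same md5 digest
            {"content": "iVBORw0KGgoAAA=="},
            {"content": "R0lGODlhAQABAA=="},  # distinct payload survives on its own
        ],
    }
)

deduped = deduplicate_images_internal(ledger, task_config={"hash_algorithm": "md5"})
print(len(deduped))  # 2: the duplicate pair collapses to its first occurrence
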
nv_ingest_api/internal/mutate/filter.py
@@ -0,0 +1,133 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import logging
+ from typing import Dict, Optional, List, Any
+
+ import pandas as pd
+
+ from nv_ingest_api.internal.enums.common import TaskTypeEnum
+ from nv_ingest_api.internal.schemas.meta.metadata_schema import (
+     ContentTypeEnum,
+     InfoMessageMetadataSchema,
+     StatusEnum,
+ )
+ from nv_ingest_api.internal.schemas.transform.transform_image_filter_schema import ImageFilterSchema
+ from nv_ingest_api.util.schema.schema_validator import validate_schema
+
+ logger = logging.getLogger(__name__)
+
+
+ def _add_info_message(x, info_msg):
+     x["info_message_metadata"] = info_msg
+     return x
+
+
+ def _calculate_average_image_size(x):
+     return (x["image_metadata"]["width"] + x["image_metadata"]["height"]) / 2
+
+
+ def _calculate_aspect_ratio(x):
+     return x["image_metadata"]["width"] / max(x["image_metadata"]["height"], 1e-9)
+
+
+ def filter_images_internal(
+     df_ledger: pd.DataFrame,
+     task_config: Dict[str, Any],
+     mutate_config: ImageFilterSchema = ImageFilterSchema(),
+     execution_trace_log: Optional[List[Any]] = None,
+ ) -> pd.DataFrame:
+     """
+     Apply an image filtering operation to a DataFrame based on average image size and aspect ratio.
+
+     Parameters
+     ----------
+     df_ledger : pd.DataFrame
+         DataFrame to be filtered. Must contain 'document_type' and 'metadata' columns.
+     task_config : dict
+         Dictionary with the following keys:
+         - "min_size": Minimum average image size threshold.
+         - "max_aspect_ratio": Maximum allowed aspect ratio.
+         - "min_aspect_ratio": Minimum allowed aspect ratio.
+         - "filter": If True, rows failing the criteria are dropped; if False, they are flagged.
+     mutate_config : ImageFilterSchema
+     execution_trace_log : Optional[List[Any]], optional
+
+     Returns
+     -------
+     pd.DataFrame
+         The updated DataFrame after applying the image filter.
+
+     Raises
+     ------
+     ValueError
+         If required columns are missing or if parameters are invalid.
+     Exception
+         For other errors encountered during filtering.
+     """
+
+     _ = mutate_config  # Unused variable
+     _ = execution_trace_log  # TODO(Devin)
+
+     try:
+         required_columns = {"document_type", "metadata"}
+         if not required_columns.issubset(df_ledger.columns):
+             raise ValueError(f"DataFrame must contain columns: {required_columns}")
+
+         min_size = task_config.get("min_size")
+         max_aspect_ratio = task_config.get("max_aspect_ratio")
+         min_aspect_ratio = task_config.get("min_aspect_ratio")
+         filter_flag = task_config.get("filter", True)
+
+         if not isinstance(min_size, (int, float)) or min_size < 0:
+             raise ValueError("min_size must be a non-negative number")
+         if not isinstance(max_aspect_ratio, (int, float)) or max_aspect_ratio <= 0:
+             raise ValueError("max_aspect_ratio must be a positive number")
+         if not isinstance(min_aspect_ratio, (int, float)) or min_aspect_ratio <= 0:
+             raise ValueError("min_aspect_ratio must be a positive number")
+         if min_aspect_ratio > max_aspect_ratio:
+             raise ValueError("min_aspect_ratio cannot be greater than max_aspect_ratio")
+
+         image_mask = df_ledger["document_type"] == ContentTypeEnum.IMAGE
+         if not image_mask.any():
+             return df_ledger.copy()
+
+         df_image = df_ledger.loc[image_mask].copy()
+         avg_size = df_image["metadata"].apply(_calculate_average_image_size)
+         avg_size_mask = avg_size > min_size
+
+         aspect_ratio = df_image["metadata"].apply(_calculate_aspect_ratio)
+         min_aspect_ratio_mask = aspect_ratio > min_aspect_ratio
+         max_aspect_ratio_mask = aspect_ratio < max_aspect_ratio
+
+         valid_mask = avg_size_mask & min_aspect_ratio_mask & max_aspect_ratio_mask
+         image_filter_mask = ~valid_mask
+
+         if image_filter_mask.any():
+             filtered_df = df_image.loc[image_filter_mask].copy()
+             if filter_flag:
+                 # Drop failing rows from the ledger in place and return it.
+                 df_ledger.drop(labels=filtered_df.index, inplace=True)
+                 return df_ledger
+
+             # Otherwise, flag failing rows as informational messages instead of dropping them.
+             info_msg = {
+                 "task": TaskTypeEnum.FILTER.value,
+                 "status": StatusEnum.SUCCESS.value,
+                 "message": "Filtered due to image size or aspect ratio.",
+                 "filter": True,
+             }
+             validated_info_msg = validate_schema(info_msg, InfoMessageMetadataSchema).model_dump()
+             filtered_df["info_message_metadata"] = [validated_info_msg] * filtered_df.shape[0]
+             # Use the validated message when annotating row metadata as well.
+             filtered_df["metadata"] = filtered_df["metadata"].apply(_add_info_message, args=(validated_info_msg,))
+             df_ledger.loc[filtered_df.index, "metadata"] = filtered_df["metadata"]
+             df_ledger.loc[filtered_df.index, "document_type"] = ContentTypeEnum.INFO_MSG
+
+         return df_ledger
+
+     except Exception as e:
+         err_msg = f"filter_images_internal: Error applying image filter. Original error: {e}"
+         logger.error(err_msg, exc_info=True)
+         raise type(e)(err_msg) from e
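
For reference, a small hypothetical example of the flagging path (filter=False). The metadata shape follows the helper functions above, which read width and height from metadata["image_metadata"]; the threshold values are illustrative only.

import pandas as pd

from nv_ingest_api.internal.mutate.filter import filter_images_internal
from nv_ingest_api.internal.schemas.meta.metadata_schema import ContentTypeEnum

ledger = pd.DataFrame(
    {
        "document_type": [ContentTypeEnum.IMAGE, ContentTypeEnum.IMAGE],
        "metadata": [
            {"image_metadata": {"width": 800, "height": 600}},  # passes size and aspect-ratio checks
            {"image_metadata": {"width": 2000, "height": 10}},  # aspect ratio 200 > 5.0 -> flagged
        ],
    }
)

task_config = {"min_size": 128, "max_aspect_ratio": 5.0, "min_aspect_ratio": 0.2, "filter": False}
out = filter_images_internal(ledger, task_config)
print(out["document_type"].tolist())  # second row is rewritten to ContentTypeEnum.INFO_MSG

With filter=True instead, the failing row would simply be dropped from the ledger rather than converted into an informational message.
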