nv-ingest-api 2025.4.21.dev20250421__py3-none-any.whl → 2025.4.23.dev20250423__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (153) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +215 -0
  3. nv_ingest_api/interface/extract.py +972 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +218 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +200 -0
  8. nv_ingest_api/internal/enums/__init__.py +3 -0
  9. nv_ingest_api/internal/enums/common.py +494 -0
  10. nv_ingest_api/internal/extract/__init__.py +3 -0
  11. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
  13. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  14. nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
  15. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  16. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
  19. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  20. nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
  21. nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
  22. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
  24. nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
  25. nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
  26. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  27. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  28. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  29. nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
  30. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
  31. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
  32. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
  33. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  34. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  35. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  36. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  37. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  38. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
  39. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
  40. nv_ingest_api/internal/mutate/__init__.py +3 -0
  41. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  42. nv_ingest_api/internal/mutate/filter.py +133 -0
  43. nv_ingest_api/internal/primitives/__init__.py +0 -0
  44. nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
  45. nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
  46. nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
  47. nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
  48. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  49. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  50. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  51. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  52. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
  53. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
  59. nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
  60. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
  61. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  62. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  63. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  64. nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
  65. nv_ingest_api/internal/schemas/__init__.py +3 -0
  66. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  67. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
  68. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
  69. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
  70. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
  71. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
  72. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
  73. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
  74. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
  75. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  76. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
  77. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  78. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  79. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  80. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  81. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
  82. nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
  83. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  85. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  86. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  87. nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
  88. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  89. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
  90. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  91. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
  92. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
  93. nv_ingest_api/internal/store/__init__.py +3 -0
  94. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  95. nv_ingest_api/internal/store/image_upload.py +232 -0
  96. nv_ingest_api/internal/transform/__init__.py +3 -0
  97. nv_ingest_api/internal/transform/caption_image.py +205 -0
  98. nv_ingest_api/internal/transform/embed_text.py +496 -0
  99. nv_ingest_api/internal/transform/split_text.py +157 -0
  100. nv_ingest_api/util/__init__.py +0 -0
  101. nv_ingest_api/util/control_message/__init__.py +0 -0
  102. nv_ingest_api/util/control_message/validators.py +47 -0
  103. nv_ingest_api/util/converters/__init__.py +0 -0
  104. nv_ingest_api/util/converters/bytetools.py +78 -0
  105. nv_ingest_api/util/converters/containers.py +65 -0
  106. nv_ingest_api/util/converters/datetools.py +90 -0
  107. nv_ingest_api/util/converters/dftools.py +127 -0
  108. nv_ingest_api/util/converters/formats.py +64 -0
  109. nv_ingest_api/util/converters/type_mappings.py +27 -0
  110. nv_ingest_api/util/detectors/__init__.py +5 -0
  111. nv_ingest_api/util/detectors/language.py +38 -0
  112. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  113. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  114. nv_ingest_api/util/exception_handlers/decorators.py +223 -0
  115. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  116. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  117. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  118. nv_ingest_api/util/image_processing/__init__.py +5 -0
  119. nv_ingest_api/util/image_processing/clustering.py +260 -0
  120. nv_ingest_api/util/image_processing/processing.py +179 -0
  121. nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
  122. nv_ingest_api/util/image_processing/transforms.py +407 -0
  123. nv_ingest_api/util/logging/__init__.py +0 -0
  124. nv_ingest_api/util/logging/configuration.py +31 -0
  125. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  126. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  127. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  128. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  129. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +451 -0
  130. nv_ingest_api/util/metadata/__init__.py +5 -0
  131. nv_ingest_api/util/metadata/aggregators.py +469 -0
  132. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  133. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
  134. nv_ingest_api/util/nim/__init__.py +56 -0
  135. nv_ingest_api/util/pdf/__init__.py +3 -0
  136. nv_ingest_api/util/pdf/pdfium.py +427 -0
  137. nv_ingest_api/util/schema/__init__.py +0 -0
  138. nv_ingest_api/util/schema/schema_validator.py +10 -0
  139. nv_ingest_api/util/service_clients/__init__.py +3 -0
  140. nv_ingest_api/util/service_clients/client_base.py +86 -0
  141. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  142. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  143. nv_ingest_api/util/service_clients/redis/redis_client.py +823 -0
  144. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  145. nv_ingest_api/util/service_clients/rest/rest_client.py +531 -0
  146. nv_ingest_api/util/string_processing/__init__.py +51 -0
  147. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.23.dev20250423.dist-info}/METADATA +1 -1
  148. nv_ingest_api-2025.4.23.dev20250423.dist-info/RECORD +152 -0
  149. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.23.dev20250423.dist-info}/WHEEL +1 -1
  150. nv_ingest_api-2025.4.21.dev20250421.dist-info/RECORD +0 -9
  151. /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
  152. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.23.dev20250423.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.23.dev20250423.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,56 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ from typing import Tuple, Optional
6
+
7
+ from nv_ingest_api.internal.primitives.nim.nim_client import NimClient
8
+ from nv_ingest_api.internal.primitives.nim.nim_model_interface import ModelInterface
9
+
10
+ __all__ = ["create_inference_client"]
11
+
12
+
13
def create_inference_client(
    endpoints: Tuple[str, str],
    model_interface: ModelInterface,
    auth_token: Optional[str] = None,
    infer_protocol: Optional[str] = None,
    timeout: float = 120.0,
    max_retries: int = 5,
) -> NimClient:
    """
    Create a NimClient for interfacing with a model inference server.

    Parameters
    ----------
    endpoints : tuple
        A tuple containing the gRPC and HTTP endpoints, in that order.
    model_interface : ModelInterface
        The model interface implementation to use.
    auth_token : str, optional
        Authorization token for HTTP requests (default: None).
    infer_protocol : str, optional
        The protocol to use ("grpc" or "http"). If not specified, it is inferred from the
        endpoints: "grpc" when the gRPC endpoint is non-blank, otherwise "http" when the
        HTTP endpoint is non-blank.
    timeout : float, optional
        Forwarded unchanged to NimClient (default: 120.0). Presumably seconds -- confirm
        against NimClient.
    max_retries : int, optional
        Forwarded unchanged to NimClient (default: 5).

    Returns
    -------
    NimClient
        The initialized NimClient.

    Raises
    ------
    ValueError
        If an invalid infer_protocol is specified, or if none was given and neither
        endpoint is usable for inferring one.
    """
    grpc_endpoint, http_endpoint = endpoints

    if infer_protocol is None:
        # Both endpoints are treated alike: empty, None, or whitespace-only means "absent".
        if grpc_endpoint and grpc_endpoint.strip():
            infer_protocol = "grpc"
        elif http_endpoint and http_endpoint.strip():
            infer_protocol = "http"

    if infer_protocol not in ("grpc", "http"):
        raise ValueError("Invalid infer_protocol specified. Must be 'grpc' or 'http'.")

    return NimClient(model_interface, infer_protocol, endpoints, auth_token, timeout, max_retries)
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,427 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import logging
6
+ from typing import List, Any
7
+ from typing import Optional
8
+ from typing import Tuple
9
+
10
+ import PIL
11
+ import numpy as np
12
+ import pypdfium2 as pdfium
13
+ import pypdfium2.raw as pdfium_c
14
+ from numpy import dtype
15
+ from numpy import ndarray
16
+
17
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable_func
18
+ from nv_ingest_api.util.image_processing.clustering import (
19
+ group_bounding_boxes,
20
+ combine_groups_into_bboxes,
21
+ remove_superset_bboxes,
22
+ )
23
+ from nv_ingest_api.util.image_processing.transforms import pad_image, numpy_to_base64, crop_image
24
+ from nv_ingest_api.util.metadata.aggregators import Base64Image
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
# Maps each pypdfium2 raw page-object type constant (FPDF_PAGEOBJ_*) to a short readable label.
PDFIUM_PAGEOBJ_MAPPING = {
    getattr(pdfium_c, f"FPDF_PAGEOBJ_{label}"): label for label in ("TEXT", "PATH", "IMAGE", "SHADING", "FORM")
}
35
+
36
+
37
def convert_bitmap_to_corrected_numpy(bitmap: pdfium.PdfBitmap) -> np.ndarray:
    """
    Convert a PdfBitmap into a NumPy array with channels in RGB(A) order.

    Parameters
    ----------
    bitmap : pdfium.PdfBitmap
        The bitmap to convert. Its ``mode`` attribute selects the channel reordering.

    Returns
    -------
    np.ndarray
        A copy of the bitmap data. For modes "BGRA" and "BGRX" the last axis is reordered
        to [2, 1, 0, 3]; for "BGR" to [2, 1, 0]; any other mode is returned unchanged.
    """
    # Last-axis permutation to apply for each mode that stores blue first.
    channel_orders = {
        "BGRA": [2, 1, 0, 3],
        "BGRX": [2, 1, 0, 3],
        "BGR": [2, 1, 0],
    }
    order = channel_orders.get(bitmap.mode)

    # Copy up front so the result never shares memory with whatever to_numpy() returned.
    pixels = bitmap.to_numpy().copy()

    if order is None:
        return pixels
    return pixels[..., order]
64
+
65
+
66
def pdfium_try_get_bitmap_as_numpy(image_obj) -> np.ndarray:
    """
    Retrieve the bitmap of a PdfImage object as a NumPy array, preferring the rendered
    bitmap and falling back to the raw (unrendered) one.

    Parameters
    ----------
    image_obj : PdfImage
        The PdfImage object from which to extract the bitmap.

    Returns
    -------
    np.ndarray
        The bitmap converted by ``convert_bitmap_to_corrected_numpy``.

    Raises
    ------
    PdfiumError
        If the fallback call ``get_bitmap(render=False)`` raises it.
    ValueError
        If no bitmap was obtained (``None``) after both attempts.

    Notes
    -----
    A PdfiumError from the first attempt (``render=True``) is logged at debug level and
    swallowed; the second attempt is made whenever the first yields no bitmap.
    """
    try:
        bitmap = image_obj.get_bitmap(render=True)
    except pdfium.PdfiumError as e:
        logger.debug(f"Failed to get rendered bitmap: {e}")
        bitmap = None

    if bitmap is None:
        # Fallback: ask for the raw bitmap; a failure here is propagated to the caller.
        try:
            bitmap = image_obj.get_bitmap(render=False)
        except pdfium.PdfiumError as e:
            logger.debug(f"Failed to get raw bitmap: {e}")
            raise

    if bitmap is None:
        logger.debug("Failed to obtain bitmap from the image object after both attempts.")
        raise ValueError("Failed to retrieve bitmap from the PdfImage object.")

    return convert_bitmap_to_corrected_numpy(bitmap)
120
+
121
+
122
@traceable_func(trace_name="pdf_content_extractor::pdfium_pages_to_numpy")
def pdfium_pages_to_numpy(
    pages: List[pdfium.PdfPage],
    render_dpi: int = 300,
    scale_tuple: Optional[Tuple[int, int]] = None,
    padding_tuple: Optional[Tuple[int, int]] = None,
    rotation: int = 0,
) -> tuple[list[ndarray | ndarray[Any, dtype[Any]]], list[tuple[int, int]]]:
    """
    Converts a list of PdfPage objects to a list of NumPy arrays, where each array
    represents an image of the corresponding PDF page.

    Each page is rendered to a bitmap, converted to a PIL image, optionally shrunk with
    ``thumbnail`` and optionally padded with ``pad_image``.

    Parameters
    ----------
    pages : List[pdfium.PdfPage]
        A list of PdfPage objects to be rendered and converted into NumPy arrays.
    render_dpi : int, optional
        The DPI (dots per inch) at which to render the pages. Must be between 50 and 1200.
        Defaults to 300.
    scale_tuple : Optional[Tuple[int, int]], optional
        A tuple (width, height) passed to ``PIL.Image.Image.thumbnail`` with LANCZOS
        resampling. Defaults to None (no scaling).
    padding_tuple : Optional[Tuple[int, int]], optional
        A tuple (width, height) to pad the image to. Defaults to None (no padding).
    rotation : int, optional
        Forwarded unchanged to ``PdfPage.render``. Presumably degrees -- confirm against
        pypdfium2. Defaults to 0.

    Returns
    -------
    tuple
        A tuple containing:
            - A list of NumPy arrays, one per page. Each array is an independent copy of
              the rendered image data.
            - A list of padding offsets, one per page, as tuples of
              (offset_width, offset_height); (0, 0) when no padding was requested.

    Raises
    ------
    ValueError
        If the render_dpi is outside the allowed range (50-1200).

    Notes
    -----
    Exceptions raised while rendering, converting or padding a page are not caught here
    and propagate to the caller.
    """
    if not (50 <= render_dpi <= 1200):
        raise ValueError("render_dpi must be between 50 and 1200.")

    images = []
    padding_offsets = []
    scale = render_dpi / 72  # 72 DPI is the base DPI in PDFium

    for page in pages:
        # Render the page as a bitmap with the specified scale and rotation
        page_bitmap = page.render(scale=scale, rotation=rotation)

        # Convert the bitmap to a PIL image
        pil_image = page_bitmap.to_pil()

        # thumbnail() resizes in place
        if scale_tuple:
            pil_image.thumbnail(scale_tuple, PIL.Image.LANCZOS)

        # Force a full copy so the returned array is independent of the original buffer.
        img_arr = np.array(pil_image).copy()

        if padding_tuple:
            img_arr, (pad_width, pad_height) = pad_image(
                img_arr, target_width=padding_tuple[0], target_height=padding_tuple[1]
            )
            padding_offsets.append((pad_width, pad_height))
        else:
            padding_offsets.append((0, 0))

        images.append(img_arr)

    return images, padding_offsets
203
+
204
+
205
def convert_pdfium_position(pos, page_width, page_height):
    """
    Flip a PDFium bounding box vertically so that y=0 is at the top of the page, clamp it
    to the page rectangle, and truncate the coordinates to integers.

    Parameters
    ----------
    pos : sequence of four numbers
        The box as (left, bottom, right, top).
    page_width, page_height : number
        Page dimensions used for the vertical flip and for clamping.

    Returns
    -------
    list of int
        [x0, y0, x1, y1], with x0/y0 clamped to be >= 0, x1 clamped to <= page_width and
        y1 clamped to <= page_height.

    Notes
    -----
    The conversion assumes the common PDF convention of a bottom-left origin. The PDF
    specification allows other coordinate set-ups (origin not at (0, 0), box defined
    between any opposite corners); those are not handled here, so downstream filtering
    or deduplication stages may need to compensate.

    See https://github.com/pypdfium2-team/pypdfium2/discussions/284.
    """
    left, bottom, right, top = pos

    x_min = max(0, left)
    y_min = max(0, page_height - top)
    x_max = min(page_width, right)
    y_max = min(page_height, page_height - bottom)

    return [int(coord) for coord in (x_min, y_min, x_max, y_max)]
233
+
234
+
235
def extract_simple_images_from_pdfium_page(page, max_depth):
    """
    Extract the image objects of a PDFium page as Base64Image records.

    Parameters
    ----------
    page
        Page object providing ``get_width``, ``get_height`` and ``get_objects``
        (presumably a ``pdfium.PdfPage`` -- confirm against callers).
    max_depth : int
        Forwarded to ``page.get_objects`` as ``max_depth``.

    Returns
    -------
    list
        One Base64Image per successfully extracted image. An empty list is returned if
        enumerating the page objects fails.

    Notes
    -----
    - Images whose reported width and height are both below 10 are skipped. The size is
      checked before the bitmap is decoded so discarded images cost no decode/encode work.
    - A failure on a single object is logged and that object is skipped.
    - ``bbox`` is stored exactly as returned by ``obj.get_pos()``; it is not passed
      through ``convert_pdfium_position``.
    """
    page_width = page.get_width()
    page_height = page.get_height()

    try:
        image_objects = page.get_objects(
            filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,),
            max_depth=max_depth,
        )
    except Exception as e:
        logger.exception(f"Unhandled error extracting image: {e}")
        return []

    extracted_images = []
    for obj in image_objects:
        try:
            # Cheap size check first: tiny images are dropped without being decoded.
            image_size = obj.get_size()
            if image_size[0] < 10 and image_size[1] < 10:
                continue

            image_numpy: np.ndarray = pdfium_try_get_bitmap_as_numpy(obj)  # noqa
            image_base64: str = numpy_to_base64(image_numpy)
            image_bbox = obj.get_pos()

            image_data = Base64Image(
                image=image_base64,
                bbox=image_bbox,
                width=image_size[0],
                height=image_size[1],
                max_width=page_width,
                max_height=page_height,
            )
            extracted_images.append(image_data)
        except Exception as e:
            # Pdfium failed to extract the image associated with this object - corrupt or missing.
            logger.exception(f"Unhandled error extracting image: {e}")

    return extracted_images
273
+
274
+
275
def extract_nested_simple_images_from_pdfium_page(page):
    """Run ``extract_simple_images_from_pdfium_page`` on ``page`` with ``max_depth=2``."""
    nested_images = extract_simple_images_from_pdfium_page(page, max_depth=2)
    return nested_images
277
+
278
+
279
def extract_top_level_simple_images_from_pdfium_page(page):
    """Run ``extract_simple_images_from_pdfium_page`` on ``page`` with ``max_depth=1``."""
    top_level_images = extract_simple_images_from_pdfium_page(page, max_depth=1)
    return top_level_images
281
+
282
+
283
def extract_merged_images_from_pdfium_page(page, merge=True, **kwargs):
    """
    Collect the bounding boxes of the top-level image objects on a PDFium page and,
    optionally, merge boxes that are grouped together.

    Parameters
    ----------
    page
        Page object providing ``get_width``, ``get_height`` and ``get_objects``.
    merge : bool, optional
        When falsy, the converted boxes are returned without grouping. Defaults to True.
    **kwargs
        ``images_threshold`` (default 10.0) and ``images_max_num_boxes`` (default 1024)
        are forwarded to ``group_bounding_boxes``; other keys are ignored.

    Returns
    -------
    list
        Bounding boxes produced by ``convert_pdfium_position``, or the merged boxes from
        ``combine_groups_into_bboxes`` when merging applies.
    """
    group_threshold = kwargs.get("images_threshold", 10.0)
    group_limit = kwargs.get("images_max_num_boxes", 1_024)

    page_width, page_height = page.get_width(), page.get_height()

    image_objects = page.get_objects(
        filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,),
        max_depth=1,
    )
    image_bboxes = [convert_pdfium_position(obj.get_pos(), page_width, page_height) for obj in image_objects]

    # Nothing to merge, or merging not requested: hand back the raw boxes.
    if not (merge and image_bboxes):
        return image_bboxes

    groups = group_bounding_boxes(image_bboxes, threshold=group_threshold, max_num_boxes=group_limit)
    return combine_groups_into_bboxes(image_bboxes, groups)
310
+
311
+
312
def extract_merged_shapes_from_pdfium_page(page, merge=True, **kwargs):
    """
    Collect the bounding boxes of the top-level path objects (shapes) on a PDFium page,
    optionally merge grouped boxes, and drop merged boxes larger than half the page.

    Parameters
    ----------
    page
        Page object providing ``get_width``, ``get_height`` and ``get_objects``.
    merge : bool, optional
        When falsy, the converted boxes are returned as-is (no grouping and no area
        filtering). Defaults to True.
    **kwargs
        ``shapes_threshold`` (default 10.0) and ``shapes_max_num_boxes`` (default 2048)
        are forwarded to ``group_bounding_boxes``; ``shapes_min_num_components``
        (default 3) is forwarded to ``combine_groups_into_bboxes``.

    Returns
    -------
    list
        The resulting bounding boxes.
    """
    group_threshold = kwargs.get("shapes_threshold", 10.0)
    group_limit = kwargs.get("shapes_max_num_boxes", 2_048)
    min_components = kwargs.get("shapes_min_num_components", 3)

    page_width, page_height = page.get_width(), page.get_height()
    page_area = page_width * page_height

    path_objects = page.get_objects(
        filter=(pdfium_c.FPDF_PAGEOBJ_PATH,),
        max_depth=1,
    )
    path_bboxes = [convert_pdfium_position(obj.get_pos(), page_width, page_height) for obj in path_objects]

    if not (merge and path_bboxes):
        return path_bboxes

    groups = group_bounding_boxes(path_bboxes, threshold=group_threshold, max_num_boxes=group_limit)
    combined = combine_groups_into_bboxes(path_bboxes, groups, min_num_components=min_components)

    # Shapes covering more than half the page are likely backgrounds or false positives.
    area_limit = 0.5 * page_area
    return [box for box in combined if not (abs(box[0] - box[2]) * abs(box[1] - box[3]) > area_limit)]
350
+
351
+
352
def extract_forms_from_pdfium_page(page, **kwargs):
    """
    Extract bounding boxes for the top-level PDF form objects on a PDFium page.

    The boxes are grouped and combined, boxes covering more than half of the page area
    are dropped, and finally ``remove_superset_bboxes`` is applied so that boxes
    enclosing other boxes are removed.

    Parameters
    ----------
    page
        Page object providing ``get_width``, ``get_height`` and ``get_objects``.
    **kwargs
        ``forms_threshold`` (default 10.0) and ``forms_max_num_boxes`` (default 1024)
        are forwarded to ``group_bounding_boxes``; other keys are ignored.

    Returns
    -------
    list
        The remaining bounding boxes; an empty list when the page has no form objects.
    """
    threshold = kwargs.get("forms_threshold", 10.0)
    max_num_boxes = kwargs.get("forms_max_num_boxes", 1_024)

    page_width = page.get_width()
    page_height = page.get_height()
    page_area = page_width * page_height

    form_bboxes = [
        convert_pdfium_position(obj.get_pos(), page_width, page_height)
        for obj in page.get_objects(
            filter=(pdfium_c.FPDF_PAGEOBJ_FORM,),
            max_depth=1,
        )
    ]

    # Same guard as the image/shape extractors: skip the grouping helpers on empty input.
    if not form_bboxes:
        return []

    form_groups = group_bounding_boxes(form_bboxes, threshold=threshold, max_num_boxes=max_num_boxes)
    form_bboxes = combine_groups_into_bboxes(form_bboxes, form_groups)

    # Exclude shapes that are too large (likely page backgrounds or false positives)
    max_area = 0.5 * page_area
    merged_bboxes = [
        bbox for bbox in form_bboxes if not (abs(bbox[0] - bbox[2]) * abs(bbox[1] - bbox[3]) > max_area)
    ]

    # Remove any bounding box that strictly encloses another.
    # The larger one is likely a background.
    return remove_superset_bboxes(merged_bboxes)
387
+
388
+
389
def extract_image_like_objects_from_pdfium_page(page, merge=True, **kwargs):
    """
    Crop every image-like region (images, shapes, forms) out of a rendering of the page
    and return the crops as Base64Image records.

    Parameters
    ----------
    page
        Page object providing ``get_width``, ``get_height``, ``get_rotation`` and
        whatever the per-type extractors require.
    merge : bool, optional
        Forwarded to the image and shape extractors. Defaults to True.
    **kwargs
        Forwarded to the image, shape and form extractors.

    Returns
    -------
    list
        One Base64Image per region that could be cropped. An empty list is returned if
        rendering the page or collecting any of the bounding-box lists fails.

    Notes
    -----
    Regions for which ``crop_image`` returns ``None`` (called with ``min_width=10`` and
    ``min_height=10``) are skipped. A failure on a single region is logged and that
    region is skipped.
    """
    max_width = page.get_width()
    max_height = page.get_height()
    page_rotation = page.get_rotation()

    try:
        rendered_pages, _ = pdfium_pages_to_numpy(
            [page],  # A batch with a single image.
            render_dpi=72,  # dpi = 72 is equivalent to scale = 1.
            rotation=page_rotation,  # Without rotation, coordinates from page.get_pos() will not match.
        )
        image_bboxes = extract_merged_images_from_pdfium_page(page, merge=merge, **kwargs)
        shape_bboxes = extract_merged_shapes_from_pdfium_page(page, merge=merge, **kwargs)
        form_bboxes = extract_forms_from_pdfium_page(page, **kwargs)
    except Exception as e:
        logger.exception(f"Unhandled error extracting image: {e}")
        return []

    results = []
    for region in image_bboxes + shape_bboxes + form_bboxes:
        try:
            crop = crop_image(rendered_pages[0], region, min_width=10, min_height=10)
            if crop is None:  # Small images are filtered out.
                continue
            results.append(
                Base64Image(
                    image=numpy_to_base64(crop),
                    bbox=region,
                    width=region[2] - region[0],
                    height=region[3] - region[1],
                    max_width=max_width,
                    max_height=max_height,
                )
            )
        except Exception as e:
            # Pdfium failed to extract the image associated with this object - corrupt or missing.
            logger.exception(f"Unhandled error extracting image: {e}")

    return results
File without changes
@@ -0,0 +1,10 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ from nv_ingest_api.util.exception_handlers.schemas import schema_exception_handler
6
+
7
+
8
@schema_exception_handler
def validate_schema(metadata, Schema):
    """
    Build an instance of ``Schema`` from ``metadata``.

    ``metadata`` is expanded into keyword arguments, so it must be a mapping with string
    keys. Exceptions are handled by the ``schema_exception_handler`` decorator.
    """
    validated = Schema(**metadata)
    return validated
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,86 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ from abc import ABC
6
+ from abc import abstractmethod
7
+ from enum import Enum, auto
8
+ from typing import Tuple
9
+
10
+ from nv_ingest_api.internal.schemas.message_brokers.response_schema import ResponseSchema
11
+
12
+
13
class FetchMode(Enum):
    """How a fetched message is treated in the backing store after it has been read."""

    # Read and delete immediately (current behavior)
    DESTRUCTIVE = auto()
    # Read without deleting (requires TTL on Redis data)
    NON_DESTRUCTIVE = auto()
    # Read, write to local cache, then delete from Redis
    CACHE_BEFORE_DELETE = auto()
17
+
18
+
19
class MessageBrokerClientBase(ABC):
    """
    Abstract interface for message-broker clients.

    Concrete clients implement connection handling, a liveness check, and the
    fetch/submit operations declared below. Every method here is abstract.
    """

    @abstractmethod
    def __init__(
        self,
        host: str,
        port: int,
        db: int = 0,
        max_retries: int = 0,
        max_backoff: int = 32,
        connection_timeout: int = 300,
        max_pool_size: int = 128,
        use_ssl: bool = False,
    ):
        """
        Set up the client from its connection parameters.

        The interpretation of each parameter is left to the concrete implementation.
        """

    @abstractmethod
    def get_client(self):
        """
        Return the underlying client instance, reconnecting if necessary.

        Returns:
            The client instance.
        """

    @abstractmethod
    def ping(self) -> bool:
        """
        Check whether the server is responsive.

        Returns:
            True if the server responds to a ping, False otherwise.
        """

    @abstractmethod
    def fetch_message(
        self, job_index: str, timeout: Tuple[int, float] = (100, None), override_fetch_mode: FetchMode = None
    ) -> ResponseSchema:
        """
        Fetch the message for a job, with retries on failure.

        Parameters:
            job_index (str): The index of the job to fetch the message for.
            timeout (tuple): A two-element tuple; defaults to (100, None). The meaning of
                each element is defined by the concrete client.
                NOTE(review): earlier documentation described this as a single float
                number of seconds -- confirm against implementations.
            override_fetch_mode (FetchMode): Optional; overrides the default fetch mode.
                Defaults to None.

        Returns:
            The fetched message, or None if no message could be fetched.
        """

    @abstractmethod
    def submit_message(self, channel_name: str, message: str, for_nv_ingest=False) -> ResponseSchema:
        """
        Submit a message to a queue, with retries on failure.

        Parameters:
            channel_name (str): The name of the queue to submit the message to.
            message (str): The message to submit.
            for_nv_ingest (bool): Whether the message is for NV Ingest. Defaults to False.
        """
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
File without changes