nv-ingest-api 2025.4.20.dev20250420__py3-none-any.whl → 2025.4.22.dev20250422__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nv-ingest-api has been flagged as possibly problematic.

Files changed (153)
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +215 -0
  3. nv_ingest_api/interface/extract.py +972 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +218 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +200 -0
  8. nv_ingest_api/internal/enums/__init__.py +3 -0
  9. nv_ingest_api/internal/enums/common.py +494 -0
  10. nv_ingest_api/internal/extract/__init__.py +3 -0
  11. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
  13. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  14. nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
  15. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  16. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
  19. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  20. nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
  21. nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
  22. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
  24. nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
  25. nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
  26. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  27. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  28. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  29. nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
  30. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
  31. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
  32. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
  33. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  34. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  35. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  36. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  37. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  38. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
  39. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
  40. nv_ingest_api/internal/mutate/__init__.py +3 -0
  41. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  42. nv_ingest_api/internal/mutate/filter.py +133 -0
  43. nv_ingest_api/internal/primitives/__init__.py +0 -0
  44. nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
  45. nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
  46. nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
  47. nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
  48. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  49. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  50. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  51. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  52. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
  53. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
  59. nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
  60. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
  61. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  62. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  63. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  64. nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
  65. nv_ingest_api/internal/schemas/__init__.py +3 -0
  66. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  67. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
  68. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
  69. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
  70. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
  71. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
  72. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
  73. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
  74. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
  75. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  76. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
  77. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  78. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  79. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  80. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  81. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
  82. nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
  83. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  85. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  86. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  87. nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
  88. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  89. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
  90. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  91. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
  92. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
  93. nv_ingest_api/internal/store/__init__.py +3 -0
  94. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  95. nv_ingest_api/internal/store/image_upload.py +232 -0
  96. nv_ingest_api/internal/transform/__init__.py +3 -0
  97. nv_ingest_api/internal/transform/caption_image.py +205 -0
  98. nv_ingest_api/internal/transform/embed_text.py +496 -0
  99. nv_ingest_api/internal/transform/split_text.py +157 -0
  100. nv_ingest_api/util/__init__.py +0 -0
  101. nv_ingest_api/util/control_message/__init__.py +0 -0
  102. nv_ingest_api/util/control_message/validators.py +47 -0
  103. nv_ingest_api/util/converters/__init__.py +0 -0
  104. nv_ingest_api/util/converters/bytetools.py +78 -0
  105. nv_ingest_api/util/converters/containers.py +65 -0
  106. nv_ingest_api/util/converters/datetools.py +90 -0
  107. nv_ingest_api/util/converters/dftools.py +127 -0
  108. nv_ingest_api/util/converters/formats.py +64 -0
  109. nv_ingest_api/util/converters/type_mappings.py +27 -0
  110. nv_ingest_api/util/detectors/__init__.py +5 -0
  111. nv_ingest_api/util/detectors/language.py +38 -0
  112. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  113. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  114. nv_ingest_api/util/exception_handlers/decorators.py +223 -0
  115. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  116. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  117. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  118. nv_ingest_api/util/image_processing/__init__.py +5 -0
  119. nv_ingest_api/util/image_processing/clustering.py +260 -0
  120. nv_ingest_api/util/image_processing/processing.py +179 -0
  121. nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
  122. nv_ingest_api/util/image_processing/transforms.py +407 -0
  123. nv_ingest_api/util/logging/__init__.py +0 -0
  124. nv_ingest_api/util/logging/configuration.py +31 -0
  125. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  126. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  127. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  128. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  129. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +451 -0
  130. nv_ingest_api/util/metadata/__init__.py +5 -0
  131. nv_ingest_api/util/metadata/aggregators.py +469 -0
  132. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  133. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
  134. nv_ingest_api/util/nim/__init__.py +56 -0
  135. nv_ingest_api/util/pdf/__init__.py +3 -0
  136. nv_ingest_api/util/pdf/pdfium.py +427 -0
  137. nv_ingest_api/util/schema/__init__.py +0 -0
  138. nv_ingest_api/util/schema/schema_validator.py +10 -0
  139. nv_ingest_api/util/service_clients/__init__.py +3 -0
  140. nv_ingest_api/util/service_clients/client_base.py +86 -0
  141. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  142. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  143. nv_ingest_api/util/service_clients/redis/redis_client.py +823 -0
  144. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  145. nv_ingest_api/util/service_clients/rest/rest_client.py +531 -0
  146. nv_ingest_api/util/string_processing/__init__.py +51 -0
  147. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/METADATA +1 -1
  148. nv_ingest_api-2025.4.22.dev20250422.dist-info/RECORD +152 -0
  149. nv_ingest_api-2025.4.20.dev20250420.dist-info/RECORD +0 -9
  150. /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
  151. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/WHEEL +0 -0
  152. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/top_level.txt +0 -0
nv_ingest_api/util/metadata/aggregators.py
@@ -0,0 +1,469 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import base64
+import io
+import uuid
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Optional
+from typing import Tuple
+
+import pandas as pd
+import pypdfium2 as pdfium
+from PIL import Image
+from pypdfium2 import PdfImage
+
+from nv_ingest_api.internal.enums.common import ContentDescriptionEnum, DocumentTypeEnum
+from nv_ingest_api.internal.enums.common import ContentTypeEnum
+from nv_ingest_api.internal.schemas.meta.metadata_schema import NearbyObjectsSchema
+from nv_ingest_api.internal.enums.common import TableFormatEnum
+from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadata
+from nv_ingest_api.util.converters import datetools
+from nv_ingest_api.util.detectors.language import detect_language
+from nv_ingest_api.util.exception_handlers.pdf import pdfium_exception_handler
+
+
+@dataclass
+class CroppedImageWithContent:
+    content: str
+    image: str
+    bbox: Tuple[int, int, int, int]
+    max_width: int
+    max_height: int
+    type_string: str
+    content_format: str = ""
+
+
+@dataclass
+class LatexTable:
+    latex: pd.DataFrame
+    bbox: Tuple[int, int, int, int]
+    max_width: int
+    max_height: int
+
+
+@dataclass
+class Base64Image:
+    image: str
+    bbox: Tuple[int, int, int, int]
+    width: int
+    height: int
+    max_width: int
+    max_height: int
+
+
+@dataclass
+class PDFMetadata:
+    """
+    A data object to store metadata information extracted from a PDF document.
+    """
+
+    page_count: int
+    filename: str
+    last_modified: str
+    date_created: str
+    keywords: List[str]
+    source_type: str = "PDF"
+
+
+def extract_pdf_metadata(doc: pdfium.PdfDocument, source_id: str) -> PDFMetadata:
+    """
+    Extracts metadata and relevant information from a PDF document.
+
+    Parameters
+    ----------
+    doc : pdfium.PdfDocument
+        The opened PDF document from which metadata is extracted.
+    source_id : str
+        The identifier for the source document, typically the filename.
+
+    Returns
+    -------
+    PDFMetadata
+        An object containing extracted metadata and information including:
+        - `page_count`: The total number of pages in the PDF.
+        - `filename`: The source filename or identifier.
+        - `last_modified`: The last modified date of the PDF document.
+        - `date_created`: The creation date of the PDF document.
+        - `keywords`: Keywords associated with the PDF document.
+        - `source_type`: The type/format of the source, e.g., "PDF".
+
+    Raises
+    ------
+    PdfiumError
+        If there is an issue processing the PDF document.
+    """
+    page_count: int = len(doc)
+    filename: str = source_id
+
+    # Extract document metadata
+    doc_meta = doc.get_metadata_dict()
+
+    # Extract and process the last modified date
+    last_modified: str = doc_meta.get("ModDate")
+    if last_modified in (None, ""):
+        last_modified = datetools.remove_tz(datetime.now()).isoformat()
+    else:
+        last_modified = datetools.datetimefrompdfmeta(last_modified)
+
+    # Extract and process the creation date
+    date_created: str = doc_meta.get("CreationDate")
+    if date_created in (None, ""):
+        date_created = datetools.remove_tz(datetime.now()).isoformat()
+    else:
+        date_created = datetools.datetimefrompdfmeta(date_created)
+
+    # Extract keywords, defaulting to an empty list if not found
+    keywords: List[str] = doc_meta.get("Keywords", [])
+
+    # Create the PDFMetadata object
+    metadata = PDFMetadata(
+        page_count=page_count,
+        filename=filename,
+        last_modified=last_modified,
+        date_created=date_created,
+        keywords=keywords,
+    )
+
+    return metadata
+
+
+def construct_text_metadata(
+    accumulated_text,
+    keywords,
+    page_idx,
+    block_idx,
+    line_idx,
+    span_idx,
+    page_count,
+    text_depth,
+    source_metadata,
+    base_unified_metadata,
+    delimiter=" ",
+    bbox_max_dimensions: Tuple[int, int] = (-1, -1),
+    nearby_objects: Optional[Dict[str, Any]] = None,
+):
+    extracted_text = delimiter.join(accumulated_text)
+
+    content_metadata = {
+        "type": ContentTypeEnum.TEXT,
+        "description": ContentDescriptionEnum.PDF_TEXT,
+        "page_number": page_idx,
+        "hierarchy": {
+            "page_count": page_count,
+            "page": page_idx,
+            "block": -1,
+            "line": -1,
+            "span": -1,
+            "nearby_objects": nearby_objects or NearbyObjectsSchema(),
+        },
+    }
+
+    language = detect_language(extracted_text)
+
+    # TODO(Devin) - Implement bounding box logic for text
+    bbox = (-1, -1, -1, -1)
+
+    text_metadata = {
+        "text_type": text_depth,
+        "summary": "",
+        "keywords": keywords,
+        "language": language,
+        "text_location": bbox,
+        "text_location_max_dimensions": bbox_max_dimensions,
+    }
+
+    ext_unified_metadata = base_unified_metadata.copy()
+
+    ext_unified_metadata.update(
+        {
+            "content": extracted_text,
+            "source_metadata": source_metadata,
+            "content_metadata": content_metadata,
+            "text_metadata": text_metadata,
+        }
+    )
+
+    validated_unified_metadata = validate_metadata(ext_unified_metadata)
+
+    return [ContentTypeEnum.TEXT, validated_unified_metadata.model_dump(), str(uuid.uuid4())]
+
+
+def construct_image_metadata_from_base64(
+    base64_image: str,
+    page_idx: int,
+    page_count: int,
+    source_metadata: Dict[str, Any],
+    base_unified_metadata: Dict[str, Any],
+) -> List[Any]:
+    """
+    Extracts image data from a base64-encoded image string, decodes the image to get
+    its dimensions and bounding box, and constructs metadata for the image.
+
+    Parameters
+    ----------
+    base64_image : str
+        A base64-encoded string representing the image.
+    page_idx : int
+        The index of the current page being processed.
+    page_count : int
+        The total number of pages in the PDF document.
+    source_metadata : Dict[str, Any]
+        Metadata related to the source of the PDF document.
+    base_unified_metadata : Dict[str, Any]
+        The base unified metadata structure to be updated with the extracted image information.
+
+    Returns
+    -------
+    List[Any]
+        A list containing the content type, validated metadata dictionary, and a UUID string.
+
+    Raises
+    ------
+    ValueError
+        If the image cannot be decoded from the base64 string.
+    """
+    # Decode the base64 image
+    try:
+        image_data = base64.b64decode(base64_image)
+        image = Image.open(io.BytesIO(image_data))
+    except Exception as e:
+        raise ValueError(f"Failed to decode image from base64: {e}")
+
+    # Extract image dimensions and bounding box
+    width, height = image.size
+    bbox = (0, 0, width, height)  # Assuming the full image as the bounding box
+
+    # Construct content metadata
+    content_metadata: Dict[str, Any] = {
+        "type": ContentTypeEnum.IMAGE,
+        "description": ContentDescriptionEnum.PDF_IMAGE,
+        "page_number": page_idx,
+        "hierarchy": {
+            "page_count": page_count,
+            "page": page_idx,
+            "block": -1,
+            "line": -1,
+            "span": -1,
+        },
+    }
+
+    # Construct image metadata
+    image_metadata: Dict[str, Any] = {
+        "image_type": DocumentTypeEnum.PNG,
+        "structured_image_type": ContentTypeEnum.UNKNOWN,
+        "caption": "",
+        "text": "",
+        "image_location": bbox,
+        "image_location_max_dimensions": (width, height),
+        "height": height,
+    }
+
+    # Update the unified metadata with the extracted image information
+    unified_metadata: Dict[str, Any] = base_unified_metadata.copy()
+    unified_metadata.update(
+        {
+            "content": base64_image,
+            "source_metadata": source_metadata,
+            "content_metadata": content_metadata,
+            "image_metadata": image_metadata,
+        }
+    )
+
+    # Validate and return the unified metadata
+    validated_unified_metadata = validate_metadata(unified_metadata)
+    return [ContentTypeEnum.IMAGE, validated_unified_metadata.model_dump(), str(uuid.uuid4())]
+
+
+def construct_image_metadata_from_pdf_image(
+    pdf_image: PdfImage,
+    page_idx: int,
+    page_count: int,
+    source_metadata: Dict[str, Any],
+    base_unified_metadata: Dict[str, Any],
+) -> List[Any]:
+    """
+    Extracts image data from a PdfImage object, converts it to a base64-encoded string,
+    and constructs metadata for the image.
+
+    Parameters
+    ----------
+    pdf_image : PdfImage
+        The PdfImage object from which the image will be extracted.
+    page_idx : int
+        The index of the current page being processed.
+    page_count : int
+        The total number of pages in the PDF document.
+    source_metadata : dict
+        Metadata related to the source of the PDF document.
+    base_unified_metadata : dict
+        The base unified metadata structure to be updated with the extracted image information.
+
+    Returns
+    -------
+    List[Any]
+        A list containing the content type, validated metadata dictionary, and a UUID string.
+
+    Raises
+    ------
+    PdfiumError
+        If the image cannot be extracted due to an issue with the PdfImage object.
+    :param pdf_image:
+    """
+
+    # Construct content metadata
+    content_metadata: Dict[str, Any] = {
+        "type": ContentTypeEnum.IMAGE,
+        "description": ContentDescriptionEnum.PDF_IMAGE,
+        "page_number": page_idx,
+        "hierarchy": {
+            "page_count": page_count,
+            "page": page_idx,
+            "block": -1,
+            "line": -1,
+            "span": -1,
+        },
+    }
+
+    # Construct image metadata
+    image_metadata: Dict[str, Any] = {
+        "image_type": DocumentTypeEnum.PNG,
+        "structured_image_type": ContentTypeEnum.UNKNOWN,
+        "caption": "",
+        "text": "",
+        "image_location": pdf_image.bbox,
+        "image_location_max_dimensions": (max(pdf_image.max_width, 0), max(pdf_image.max_height, 0)),
+        "height": pdf_image.height,
+        "width": pdf_image.width,
+    }
+
+    # Update the unified metadata with the extracted image information
+    unified_metadata: Dict[str, Any] = base_unified_metadata.copy()
+    unified_metadata.update(
+        {
+            "content": pdf_image.image,
+            "source_metadata": source_metadata,
+            "content_metadata": content_metadata,
+            "image_metadata": image_metadata,
+        }
+    )
+
+    # Validate and return the unified metadata
+    validated_unified_metadata = validate_metadata(unified_metadata)
+    return [ContentTypeEnum.IMAGE, validated_unified_metadata.model_dump(), str(uuid.uuid4())]
+
+
+# TODO(Devin): Disambiguate tables and charts, create two distinct processing methods
+@pdfium_exception_handler(descriptor="pdfium")
+def construct_page_element_metadata(
+    structured_image: CroppedImageWithContent,
+    page_idx: int,
+    page_count: int,
+    source_metadata: Dict,
+    base_unified_metadata: Dict,
+):
+    """
+    +--------------------------------+--------------------------+------------+---+
+    | Table/Chart Metadata           |                          | Extracted  | Y |
+    | (tables within documents)      |                          |            |   |
+    +--------------------------------+--------------------------+------------+---+
+    | Table format                   | Structured (dataframe /  | Extracted  |   |
+    |                                | lists of rows and        |            |   |
+    |                                | columns), or serialized  |            |   |
+    |                                | as markdown, html,       |            |   |
+    |                                | latex, simple (cells     |            |   |
+    |                                | separated just as spaces)|            |   |
+    +--------------------------------+--------------------------+------------+---+
+    | Table content                  | Extracted text content   |            |   |
+    |                                |                          |            |   |
+    |                                | Important: Tables should |            |   |
+    |                                | not be chunked           |            |   |
+    +--------------------------------+--------------------------+------------+---+
+    | Table location                 | Bounding box of the table|            |   |
+    +--------------------------------+--------------------------+------------+---+
+    | Caption                        | Detected captions for    |            |   |
+    |                                | the table/chart          |            |   |
+    +--------------------------------+--------------------------+------------+---+
+    | uploaded_image_uri             | Mirrors                  |            |   |
+    |                                | source_metadata.         |            |   |
+    |                                | source_location          |            |   |
+    +--------------------------------+--------------------------+------------+---+
+    """
+
+    if structured_image.type_string in ("table",):
+        content = structured_image.image
+        structured_content_text = structured_image.content
+        structured_content_format = structured_image.content_format
+        table_format = TableFormatEnum.IMAGE
+        subtype = ContentTypeEnum.TABLE
+        description = ContentDescriptionEnum.PDF_TABLE
+        meta_name = "table_metadata"
+
+    elif structured_image.type_string in ("chart",):
+        content = structured_image.image
+        structured_content_text = structured_image.content
+        structured_content_format = structured_image.content_format
+        table_format = TableFormatEnum.IMAGE
+        subtype = ContentTypeEnum.CHART
+        description = ContentDescriptionEnum.PDF_CHART
+        # TODO(Devin) swap this to chart_metadata after we confirm metadata schema changes.
+        meta_name = "table_metadata"
+
+    elif structured_image.type_string in ("infographic",):
+        content = structured_image.image
+        structured_content_text = structured_image.content
+        structured_content_format = structured_image.content_format
+        table_format = TableFormatEnum.IMAGE
+        subtype = ContentTypeEnum.INFOGRAPHIC
+        description = ContentDescriptionEnum.PDF_INFOGRAPHIC
+        meta_name = "table_metadata"
+
+    else:
+        raise ValueError(f"Unknown table/chart/infographic type: {structured_image.type_string}")
+
+    content_metadata = {
+        "type": ContentTypeEnum.STRUCTURED,
+        "description": description,
+        "page_number": page_idx,
+        "hierarchy": {
+            "page_count": page_count,
+            "page": page_idx,
+            "line": -1,
+            "span": -1,
+        },
+        "subtype": subtype,
+    }
+
+    structured_metadata = {
+        "caption": "",
+        "table_format": table_format,
+        "table_content": structured_content_text,
+        "table_content_format": structured_content_format,
+        "table_location": structured_image.bbox,
+        "table_location_max_dimensions": (structured_image.max_width, structured_image.max_height),
+    }
+
+    ext_unified_metadata = base_unified_metadata.copy()
+
+    ext_unified_metadata.update(
+        {
+            "content": content,
+            "source_metadata": source_metadata,
+            "content_metadata": content_metadata,
+            meta_name: structured_metadata,
+        }
+    )
+
+    validated_unified_metadata = validate_metadata(ext_unified_metadata)
+
+    return [ContentTypeEnum.STRUCTURED, validated_unified_metadata.model_dump(), str(uuid.uuid4())]
+
+
+# TODO: remove this alias
+construct_table_and_chart_metadata = construct_page_element_metadata
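
For reviewers scanning this new module (its +469 line count matches nv_ingest_api/util/metadata/aggregators.py in the file list above), the document-level entry point is extract_pdf_metadata; the construct_* helpers then wrap individual extractions into unified-metadata rows that are checked by validate_metadata before being returned. A minimal, illustrative call of the entry point only; "report.pdf" is a placeholder path, not something shipped with the wheel:

    import pypdfium2 as pdfium

    from nv_ingest_api.util.metadata.aggregators import extract_pdf_metadata

    # Open any locally available PDF; pypdfium2 accepts a file path directly.
    doc = pdfium.PdfDocument("report.pdf")
    pdf_meta = extract_pdf_metadata(doc, source_id="report.pdf")

    # PDFMetadata is a plain dataclass, so its fields can be read directly.
    print(pdf_meta.page_count, pdf_meta.date_created, pdf_meta.keywords)
    doc.close()

The row-building helpers are not shown here because their source_metadata and base_unified_metadata arguments need to conform to the metadata schema enforced by validate_metadata, which sits outside this diff.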
nv_ingest_api/util/multi_processing/__init__.py
@@ -0,0 +1,8 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+from .mp_pool_singleton import ProcessWorkerPoolSingleton
+
+__all__ = ["ProcessWorkerPoolSingleton"]
nv_ingest_api/util/multi_processing/mp_pool_singleton.py
@@ -0,0 +1,194 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import logging
+import math
+import multiprocessing as mp
+import os
+from threading import Lock
+from typing import Any, Callable, Optional
+
+logger = logging.getLogger(__name__)
+
+
+class SimpleFuture:
+    """
+    A simplified future object that uses a multiprocessing Pipe to receive its result.
+
+    When the result() method is called, it blocks until the worker sends a tuple
+    (result, error) over the pipe.
+    """
+
+    def __init__(self, parent_conn: mp.connection.Connection) -> None:
+        """
+        Parameters
+        ----------
+        parent_conn : mp.connection.Connection
+            The parent end of the multiprocessing Pipe used to receive the result.
+        """
+        self._parent_conn: mp.connection.Connection = parent_conn
+
+    def result(self) -> Any:
+        """
+        Retrieve the result from the future, blocking until it is available.
+
+        Returns
+        -------
+        Any
+            The result returned by the worker function.
+
+        Raises
+        ------
+        Exception
+            If the worker function raised an exception, it is re-raised here.
+        """
+        result, error = self._parent_conn.recv()
+        if error is not None:
+            raise error
+        return result
+
+
+class ProcessWorkerPoolSingleton:
+    """
+    A singleton process worker pool using a dual-queue implementation.
+
+    Instead of a global result queue, each submitted task gets its own Pipe.
+    The submit_task() method returns a SimpleFuture, whose result() call blocks
+    until the task completes.
+    """
+
+    _instance: Optional["ProcessWorkerPoolSingleton"] = None
+    _lock: Lock = Lock()
+    _total_workers: int = 0
+
+    def __new__(cls) -> "ProcessWorkerPoolSingleton":
+        """
+        Create or return the singleton instance of ProcessWorkerPoolSingleton.
+
+        Returns
+        -------
+        ProcessWorkerPoolSingleton
+            The singleton instance.
+        """
+        logger.debug("Creating ProcessWorkerPoolSingleton instance...")
+        with cls._lock:
+            if cls._instance is None:
+                max_worker_limit: int = int(os.environ.get("MAX_INGEST_PROCESS_WORKERS", -1))
+                instance = super().__new__(cls)
+                # Determine available CPU count using affinity if possible
+                available: Optional[int] = (
+                    len(os.sched_getaffinity(0)) if hasattr(os, "sched_getaffinity") else os.cpu_count()
+                )
+                # Use 40% of available CPUs, ensuring at least one worker
+                max_workers: int = math.floor(max(1, available * 0.4))
+                if (max_worker_limit > 0) and (max_workers > max_worker_limit):
+                    max_workers = max_worker_limit
+                logger.debug("Creating ProcessWorkerPoolSingleton instance with max workers: %d", max_workers)
+                instance._initialize(max_workers)
+                logger.debug("ProcessWorkerPoolSingleton instance created: %s", instance)
+                cls._instance = instance
+            else:
+                logger.debug("ProcessWorkerPoolSingleton instance already exists: %s", cls._instance)
+        return cls._instance
+
+    def _initialize(self, total_max_workers: int) -> None:
+        """
+        Initialize the worker pool with the specified number of worker processes.
+
+        Parameters
+        ----------
+        total_max_workers : int
+            The total number of worker processes to start.
+        """
+        self._total_workers = total_max_workers
+        self._context: mp.context.ForkContext = mp.get_context("fork")
+        # Bounded task queue: maximum tasks queued = 2 * total_max_workers.
+        self._task_queue: mp.Queue = self._context.Queue(maxsize=2 * total_max_workers)
+        self._next_task_id: int = 0
+        self._processes: list[mp.Process] = []
+        logger.debug(
+            "Initializing ProcessWorkerPoolSingleton with %d workers and queue size %d.",
+            total_max_workers,
+            2 * total_max_workers,
+        )
+        for i in range(total_max_workers):
+            p: mp.Process = self._context.Process(target=self._worker, args=(self._task_queue,))
+            p.start()
+            self._processes.append(p)
+            logger.debug("Started worker process %d/%d: PID %d", i + 1, total_max_workers, p.pid)
+        logger.debug("Initialized with max workers: %d", total_max_workers)
+
+    @staticmethod
+    def _worker(task_queue: mp.Queue) -> None:
+        """
+        Worker process that continuously processes tasks from the task queue.
+
+        Parameters
+        ----------
+        task_queue : mp.Queue
+            The queue from which tasks are retrieved.
+        """
+        logger.debug("Worker process started: PID %d", os.getpid())
+        while True:
+            task = task_queue.get()
+            if task is None:
+                # Stop signal received; exit the loop.
+                logger.debug("Worker process %d received stop signal.", os.getpid())
+                break
+            # Unpack task: (task_id, process_fn, args, child_conn)
+            task_id, process_fn, args, child_conn = task
+            try:
+                result = process_fn(*args)
+                child_conn.send((result, None))
+            except Exception as e:
+                logger.error("Task %d error in worker %d: %s", task_id, os.getpid(), e)
+                child_conn.send((None, e))
+            finally:
+                child_conn.close()
+
+    def submit_task(self, process_fn: Callable, *args: Any) -> SimpleFuture:
+        """
+        Submits a task to the worker pool for asynchronous execution.
+
+        If a single tuple is passed as the only argument, it is unpacked.
+
+        Parameters
+        ----------
+        process_fn : Callable
+            The function to be executed asynchronously.
+        *args : Any
+            The arguments to pass to the process function. If a single argument is a tuple,
+            it will be unpacked as the function arguments.
+
+        Returns
+        -------
+        SimpleFuture
+            A future object that can be used to retrieve the result of the task.
+        """
+        # Unpack tuple if a single tuple argument is provided.
+        if len(args) == 1 and isinstance(args[0], tuple):
+            args = args[0]
+        parent_conn, child_conn = mp.Pipe(duplex=False)
+        task_id: int = self._next_task_id
+        self._next_task_id += 1
+        self._task_queue.put((task_id, process_fn, args, child_conn))
+        return SimpleFuture(parent_conn)
+
+    def close(self) -> None:
+        """
+        Closes the worker pool and terminates all worker processes.
+
+        Sends a stop signal to each worker and waits for them to terminate.
+        """
+        logger.debug("Closing ProcessWorkerPoolSingleton...")
+        # Send a stop signal (None) for each worker.
+        for _ in range(self._total_workers):
+            self._task_queue.put(None)
+            logger.debug("Sent stop signal to worker.")
+        # Wait for all processes to finish.
+        for i, p in enumerate(self._processes):
+            p.join()
+            logger.debug("Worker process %d/%d joined: PID %d", i + 1, self._total_workers, p.pid)
+        logger.debug("ProcessWorkerPoolSingleton closed.")