nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
@@ -0,0 +1,516 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import base64
7
+ import io
8
+ import uuid
9
+ from dataclasses import dataclass
10
+ from datetime import datetime
11
+ from typing import Any
12
+ from typing import Dict
13
+ from typing import List
14
+ from typing import Optional
15
+ from typing import Tuple
16
+
17
+ import pandas as pd
18
+ import pypdfium2 as pdfium
19
+ from PIL import Image
20
+ from pypdfium2 import PdfImage
21
+
22
+ from nv_ingest_api.internal.enums.common import ContentDescriptionEnum, DocumentTypeEnum
23
+ from nv_ingest_api.internal.enums.common import ContentTypeEnum
24
+ from nv_ingest_api.internal.schemas.meta.metadata_schema import NearbyObjectsSchema
25
+ from nv_ingest_api.internal.enums.common import TableFormatEnum
26
+ from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadata
27
+ from nv_ingest_api.util.converters import datetools
28
+ from nv_ingest_api.util.detectors.language import detect_language
29
+ from nv_ingest_api.util.exception_handlers.pdf import pdfium_exception_handler
30
+
31
+
32
@dataclass
class CroppedImageWithContent:
    """
    A cropped page element together with the content extracted for it.

    Within this module, instances are consumed by
    ``construct_page_element_metadata`` and ``_construct_text_image_primitive``.
    """

    # Extracted content for the element; emitted as "table_content".
    content: str
    # Image payload for the crop; emitted as the primitive's "content".
    image: str
    # Four-integer bounding box; emitted as the element's location.
    bbox: Tuple[int, int, int, int]
    # Emitted, paired with max_height, as the "*_location_max_dimensions" tuple.
    max_width: int
    max_height: int
    # Element kind; selects the text / table / chart / infographic handling.
    type_string: str
    # Format label for `content`; emitted as "table_content_format".
    content_format: str = ""
41
+
42
+
43
@dataclass
class LatexTable:
    """
    A table payload together with its bounding box and maximum dimensions.
    """

    # Table payload; annotated as a pandas DataFrame.
    # NOTE(review): the field name suggests LaTeX-derived content - confirm with producers.
    latex: pd.DataFrame
    # Four-integer bounding box of the table.
    bbox: Tuple[int, int, int, int]
    # Maximum width / height the bounding box is expressed against (presumably the
    # page or source image size - TODO confirm).
    max_width: int
    max_height: int
49
+
50
+
51
@dataclass
class Base64Image:
    """
    An image string together with its geometry.

    The field set (``image``, ``bbox``, ``width``, ``height``, ``max_width``,
    ``max_height``) is exactly what ``construct_image_metadata_from_pdf_image``
    reads from its ``pdf_image`` argument.
    """

    # Image payload as a string (base64-encoded, going by the class name).
    image: str
    # Four-integer bounding box of the image.
    bbox: Tuple[int, int, int, int]
    # Image width and height.
    width: int
    height: int
    # Maximum width / height the bounding box is expressed against (presumably the
    # page size - TODO confirm).
    max_width: int
    max_height: int
59
+
60
+
61
@dataclass
class PDFMetadata:
    """
    Document-level metadata record for a PDF, as produced by ``extract_pdf_metadata``.
    """

    # Number of pages in the document.
    page_count: int
    # Source identifier of the document (``extract_pdf_metadata`` stores its
    # ``source_id`` argument here).
    filename: str
    # Last-modified and creation dates of the document.
    last_modified: str
    date_created: str
    # Keywords associated with the document.
    keywords: List[str]
    # Label for the kind of source this record describes.
    source_type: str = "PDF"
73
+
74
+
75
def extract_pdf_metadata(doc: pdfium.PdfDocument, source_id: str) -> PDFMetadata:
    """
    Build a ``PDFMetadata`` record from an already-opened PDF document.

    Parameters
    ----------
    doc : pdfium.PdfDocument
        The opened document. Its length is used as the page count and its
        metadata dictionary supplies the dates and keywords.
    source_id : str
        Identifier of the source document; stored as ``filename``.

    Returns
    -------
    PDFMetadata
        A record populated with:
        - `page_count`: ``len(doc)``.
        - `filename`: ``source_id``.
        - `last_modified`: derived from the "ModDate" metadata entry.
        - `date_created`: derived from the "CreationDate" metadata entry.
        - `keywords`: the "Keywords" metadata entry, or an empty list when the
          key is absent.
        - `source_type`: left at the dataclass default.

    Notes
    -----
    Exceptions raised by ``doc.get_metadata_dict()`` or by the ``datetools``
    helpers are not caught here.
    """

    def _resolve_date(raw_value):
        # A missing or empty entry falls back to the current time (passed through
        # datetools.remove_tz, then ISO-formatted); anything else is handed to
        # datetools.datetimefrompdfmeta.
        if raw_value in (None, ""):
            return datetools.remove_tz(datetime.now()).isoformat()
        return datetools.datetimefrompdfmeta(raw_value)

    total_pages: int = len(doc)
    info = doc.get_metadata_dict()

    # Keyword arguments are evaluated top to bottom, so the lookups happen in the
    # order ModDate, CreationDate, Keywords.
    return PDFMetadata(
        page_count=total_pages,
        filename=source_id,
        last_modified=_resolve_date(info.get("ModDate")),
        date_created=_resolve_date(info.get("CreationDate")),
        keywords=info.get("Keywords", []),
    )
135
+
136
+
137
def construct_text_metadata(
    accumulated_text,
    keywords,
    page_idx,
    block_idx,
    line_idx,
    span_idx,
    page_count,
    text_depth,
    source_metadata,
    base_unified_metadata,
    delimiter=" ",
    bbox_max_dimensions: Tuple[int, int] = (-1, -1),
    nearby_objects: Optional[Dict[str, Any]] = None,
):
    """
    Assemble and validate the unified metadata entry for a piece of extracted text.

    Parameters
    ----------
    accumulated_text
        Iterable of strings; joined with ``delimiter`` to form the content.
    keywords
        Stored under ``text_metadata["keywords"]``.
    page_idx
        Stored as both ``page_number`` and ``hierarchy["page"]``.
    block_idx, line_idx, span_idx
        Accepted but not used; the hierarchy's block/line/span are always -1.
    page_count
        Stored as ``hierarchy["page_count"]``.
    text_depth
        Stored as ``text_metadata["text_type"]``.
    source_metadata
        Stored unchanged under ``"source_metadata"``.
    base_unified_metadata
        Base mapping; it is copied, and the copy receives the new entries.
    delimiter : str, optional
        Separator used when joining ``accumulated_text``.
    bbox_max_dimensions : Tuple[int, int], optional
        Stored as ``text_metadata["text_location_max_dimensions"]``.
    nearby_objects : Optional[Dict[str, Any]], optional
        Stored in the hierarchy; a falsy value is replaced by a fresh
        ``NearbyObjectsSchema()``.

    Returns
    -------
    list
        ``[ContentTypeEnum.TEXT, <validated metadata as a dict>, <new UUID string>]``.
    """
    joined_text = delimiter.join(accumulated_text)

    hierarchy = {
        "page_count": page_count,
        "page": page_idx,
        "block": -1,
        "line": -1,
        "span": -1,
        "nearby_objects": nearby_objects or NearbyObjectsSchema(),
    }
    content_section = {
        "type": ContentTypeEnum.TEXT,
        "description": ContentDescriptionEnum.PDF_TEXT,
        "page_number": page_idx,
        "hierarchy": hierarchy,
    }

    detected_language = detect_language(joined_text)

    # TODO(Devin) - Implement bounding box logic for text
    placeholder_bbox = (-1, -1, -1, -1)

    text_section = {
        "text_type": text_depth,
        "summary": "",
        "keywords": keywords,
        "language": detected_language,
        "text_location": placeholder_bbox,
        "text_location_max_dimensions": bbox_max_dimensions,
    }

    overrides = {
        "content": joined_text,
        "source_metadata": source_metadata,
        "content_metadata": content_section,
        "text_metadata": text_section,
    }
    # Work on a copy so the caller's base mapping is not modified.
    merged = base_unified_metadata.copy()
    merged.update(overrides)

    validated = validate_metadata(merged)
    return [ContentTypeEnum.TEXT, validated.model_dump(), str(uuid.uuid4())]
196
+
197
+
198
def construct_image_metadata_from_base64(
    base64_image: str,
    page_idx: int,
    page_count: int,
    source_metadata: Dict[str, Any],
    base_unified_metadata: Dict[str, Any],
    subtype: None | ContentTypeEnum | str = "",
    text: str = "",
) -> List[Any]:
    """
    Decode a base64-encoded image to read its dimensions, then construct and
    validate the unified metadata entry for that image.

    Parameters
    ----------
    base64_image : str
        A base64-encoded string representing the image. It is stored unchanged as
        the entry's content.
    page_idx : int
        The index of the current page being processed.
    page_count : int
        The total number of pages in the document.
    source_metadata : Dict[str, Any]
        Metadata related to the source document.
    base_unified_metadata : Dict[str, Any]
        The base unified metadata structure; it is copied, and the copy is updated
        with the extracted image information.
    subtype : None | ContentTypeEnum | str, optional
        Stored as ``content_metadata["subtype"]``; falsy values are stored as "".
    text : str, optional
        Stored as ``image_metadata["text"]``.

    Returns
    -------
    List[Any]
        A list containing the content type, validated metadata dictionary, and a UUID string.

    Raises
    ------
    ValueError
        If the image cannot be decoded from the base64 string. The underlying
        exception is attached as ``__cause__``.
    """
    # Only the dimensions are needed, so the image is closed as soon as they are read.
    try:
        image_data = base64.b64decode(base64_image)
        with Image.open(io.BytesIO(image_data)) as image:
            width, height = image.size
    except Exception as e:
        raise ValueError(f"Failed to decode image from base64: {e}") from e

    bbox = (0, 0, width, height)  # Assuming the full image as the bounding box

    # Construct content metadata
    content_metadata: Dict[str, Any] = {
        "type": ContentTypeEnum.IMAGE,
        "description": ContentDescriptionEnum.PDF_IMAGE,
        "page_number": page_idx,
        "hierarchy": {
            "page_count": page_count,
            "page": page_idx,
            "block": -1,
            "line": -1,
            "span": -1,
        },
        "subtype": subtype or "",
    }

    # Construct image metadata. "width" is reported alongside "height", matching
    # construct_image_metadata_from_pdf_image, which emits both.
    image_metadata: Dict[str, Any] = {
        "image_type": DocumentTypeEnum.PNG,
        "structured_image_type": ContentTypeEnum.UNKNOWN,
        "caption": "",
        "text": text,
        "image_location": bbox,
        "image_location_max_dimensions": (width, height),
        "height": height,
        "width": width,
    }

    # Update a copy of the unified metadata with the extracted image information
    unified_metadata: Dict[str, Any] = base_unified_metadata.copy()
    unified_metadata.update(
        {
            "content": base64_image,
            "source_metadata": source_metadata,
            "content_metadata": content_metadata,
            "image_metadata": image_metadata,
        }
    )

    # Validate and return the unified metadata
    validated_unified_metadata = validate_metadata(unified_metadata)
    return [ContentTypeEnum.IMAGE, validated_unified_metadata.model_dump(), str(uuid.uuid4())]
285
+
286
+
287
def construct_image_metadata_from_pdf_image(
    pdf_image: PdfImage,
    page_idx: int,
    page_count: int,
    source_metadata: Dict[str, Any],
    base_unified_metadata: Dict[str, Any],
) -> List[Any]:
    """
    Construct and validate the unified metadata entry for an image object that
    already carries its payload and geometry.

    Parameters
    ----------
    pdf_image : PdfImage
        The image object. This function reads its ``image``, ``bbox``, ``width``,
        ``height``, ``max_width`` and ``max_height`` attributes.
        NOTE(review): that attribute set matches the ``Base64Image`` dataclass in
        this module - confirm whether the annotation should be ``Base64Image``.
    page_idx : int
        The index of the current page being processed.
    page_count : int
        The total number of pages in the document.
    source_metadata : Dict[str, Any]
        Metadata related to the source document.
    base_unified_metadata : Dict[str, Any]
        The base unified metadata structure; it is copied, and the copy is updated
        with the image information.

    Returns
    -------
    List[Any]
        A list containing the content type, validated metadata dictionary, and a UUID string.
    """
    hierarchy: Dict[str, Any] = {
        "page_count": page_count,
        "page": page_idx,
        "block": -1,
        "line": -1,
        "span": -1,
    }
    content_section: Dict[str, Any] = {
        "type": ContentTypeEnum.IMAGE,
        "description": ContentDescriptionEnum.PDF_IMAGE,
        "page_number": page_idx,
        "hierarchy": hierarchy,
    }

    # Negative maximum dimensions are clamped to zero before being reported.
    max_dimensions = (max(pdf_image.max_width, 0), max(pdf_image.max_height, 0))
    image_section: Dict[str, Any] = {
        "image_type": DocumentTypeEnum.PNG,
        "structured_image_type": ContentTypeEnum.UNKNOWN,
        "caption": "",
        "text": "",
        "image_location": pdf_image.bbox,
        "image_location_max_dimensions": max_dimensions,
        "height": pdf_image.height,
        "width": pdf_image.width,
    }

    # Work on a copy so the caller's base mapping is not modified.
    merged: Dict[str, Any] = base_unified_metadata.copy()
    overrides: Dict[str, Any] = {
        "content": pdf_image.image,
        "source_metadata": source_metadata,
        "content_metadata": content_section,
        "image_metadata": image_section,
    }
    merged.update(overrides)

    validated = validate_metadata(merged)
    return [ContentTypeEnum.IMAGE, validated.model_dump(), str(uuid.uuid4())]
363
+
364
+
365
def _construct_text_image_primitive(
    cropped_image: CroppedImageWithContent,
    page_idx: int,
    page_count: int,
    source_metadata: Dict,
    base_unified_metadata: Dict,
) -> List[Any]:
    """
    Construct a TEXT-typed primitive whose content is the cropped image of a
    detected text block, intended for downstream OCR.

    Parameters
    ----------
    cropped_image : CroppedImageWithContent
        The detected block. Its ``image`` becomes the content, its ``type_string``
        the subtype, and its ``bbox`` / ``max_width`` / ``max_height`` the text
        location fields.
    page_idx : int
        Stored as both ``page_number`` and ``hierarchy["page"]``.
    page_count : int
        Stored as ``hierarchy["page_count"]``.
    source_metadata : Dict
        Stored unchanged under ``"source_metadata"``.
    base_unified_metadata : Dict
        Base mapping; it is copied, and the copy receives the new entries.

    Returns
    -------
    List[Any]
        ``[ContentTypeEnum.TEXT, <validated metadata as a dict>, <new UUID string>]``.
    """
    content_section = {
        "type": ContentTypeEnum.TEXT,
        "description": ContentDescriptionEnum.PDF_TEXT,
        "page_number": page_idx,
        "hierarchy": {"page_count": page_count, "page": page_idx},
        "subtype": cropped_image.type_string,
    }

    location_limits = (cropped_image.max_width, cropped_image.max_height)
    text_section = {
        "text_type": "page",
        "text_location": cropped_image.bbox,
        "text_location_max_dimensions": location_limits,
    }

    # Work on a copy so the caller's base mapping is not modified.
    merged = base_unified_metadata.copy()
    overrides = {
        "content": cropped_image.image,  # The base64 image of the text block
        "source_metadata": source_metadata,
        "content_metadata": content_section,
        "text_metadata": text_section,
    }
    merged.update(overrides)

    validated = validate_metadata(merged)
    return [ContentTypeEnum.TEXT, validated.model_dump(), str(uuid.uuid4())]
402
+
403
+
404
# TODO(Devin): Disambiguate tables and charts, create two distinct processing methods
@pdfium_exception_handler(descriptor="pdfium")
def construct_page_element_metadata(
    structured_image: CroppedImageWithContent,
    page_idx: int,
    page_count: int,
    source_metadata: Dict,
    base_unified_metadata: Dict,
):
    """
    Construct and validate the unified metadata entry for a detected page element.

    Elements whose ``type_string`` is "paragraph", "title" or "header_footer" are
    delegated to ``_construct_text_image_primitive``. "table", "chart" and
    "infographic" elements produce a STRUCTURED entry whose details are stored
    under the "table_metadata" key. Any other ``type_string`` raises ``ValueError``.

    Returns a list of ``[content type, validated metadata as a dict, new UUID string]``.

    NOTE(review): the function is wrapped by ``pdfium_exception_handler``; how that
    decorator treats raised exceptions is not visible here - confirm before relying
    on the ``ValueError`` reaching callers.

    +--------------------------------+--------------------------+------------+---+
    | Table/Chart Metadata           |                          | Extracted  | Y |
    | (tables within documents)      |                          |            |   |
    +--------------------------------+--------------------------+------------+---+
    | Table format                   | Structured (dataframe /  | Extracted  |   |
    |                                | lists of rows and        |            |   |
    |                                | columns), or serialized  |            |   |
    |                                | as markdown, html,       |            |   |
    |                                | latex, simple (cells     |            |   |
    |                                | separated just as spaces)|            |   |
    +--------------------------------+--------------------------+------------+---+
    | Table content                  | Extracted text content   |            |   |
    |                                |                          |            |   |
    |                                | Important: Tables should |            |   |
    |                                | not be chunked           |            |   |
    +--------------------------------+--------------------------+------------+---+
    | Table location                 | Bounding box of the table|            |   |
    +--------------------------------+--------------------------+------------+---+
    | Caption                        | Detected captions for    |            |   |
    |                                | the table/chart          |            |   |
    +--------------------------------+--------------------------+------------+---+
    | uploaded_image_uri             | Mirrors                  |            |   |
    |                                | source_metadata.         |            |   |
    |                                | source_location          |            |   |
    +--------------------------------+--------------------------+------------+---+
    """
    # Text-like detections are emitted as text primitives rather than structured ones.
    if structured_image.type_string in {"paragraph", "title", "header_footer"}:
        return _construct_text_image_primitive(
            structured_image, page_idx, page_count, source_metadata, base_unified_metadata
        )

    # Only the subtype, description and metadata key differ between element kinds;
    # everything else is shared and assembled below.
    if structured_image.type_string in ("table",):
        subtype = ContentTypeEnum.TABLE
        description = ContentDescriptionEnum.PDF_TABLE
        meta_name = "table_metadata"
    elif structured_image.type_string in ("chart",):
        subtype = ContentTypeEnum.CHART
        description = ContentDescriptionEnum.PDF_CHART
        # TODO(Devin) swap this to chart_metadata after we confirm metadata schema changes.
        meta_name = "table_metadata"
    elif structured_image.type_string in ("infographic",):
        subtype = ContentTypeEnum.INFOGRAPHIC
        description = ContentDescriptionEnum.PDF_INFOGRAPHIC
        meta_name = "table_metadata"
    else:
        raise ValueError(f"Unknown table/chart/infographic type: {structured_image.type_string}")

    content_section = {
        "type": ContentTypeEnum.STRUCTURED,
        "description": description,
        "page_number": page_idx,
        "hierarchy": {
            "page_count": page_count,
            "page": page_idx,
            "line": -1,
            "span": -1,
        },
        "subtype": subtype,
    }

    location_limits = (structured_image.max_width, structured_image.max_height)
    structured_section = {
        "caption": "",
        "table_format": TableFormatEnum.IMAGE,
        "table_content": structured_image.content,
        "table_content_format": structured_image.content_format,
        "table_location": structured_image.bbox,
        "table_location_max_dimensions": location_limits,
    }

    # Work on a copy so the caller's base mapping is not modified.
    merged = base_unified_metadata.copy()
    overrides = {
        "content": structured_image.image,
        "source_metadata": source_metadata,
        "content_metadata": content_section,
        meta_name: structured_section,
    }
    merged.update(overrides)

    validated = validate_metadata(merged)
    return [ContentTypeEnum.STRUCTURED, validated.model_dump(), str(uuid.uuid4())]


# TODO: remove this alias
construct_table_and_chart_metadata = construct_page_element_metadata
@@ -0,0 +1,8 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ from .mp_pool_singleton import ProcessWorkerPoolSingleton
7
+
8
+ __all__ = ["ProcessWorkerPoolSingleton"]