nv-ingest-api 2025.4.17.dev20250417__py3-none-any.whl → 2025.4.19.dev20250419__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (153) hide show
  1. nv_ingest_api/__init__.py +0 -3
  2. nv_ingest_api/{internal/primitives → primitives}/control_message_task.py +0 -4
  3. nv_ingest_api/{internal/primitives → primitives}/ingest_control_message.py +2 -5
  4. {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/METADATA +1 -1
  5. nv_ingest_api-2025.4.19.dev20250419.dist-info/RECORD +9 -0
  6. {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/WHEEL +1 -1
  7. nv_ingest_api/interface/__init__.py +0 -215
  8. nv_ingest_api/interface/extract.py +0 -972
  9. nv_ingest_api/interface/mutate.py +0 -154
  10. nv_ingest_api/interface/store.py +0 -218
  11. nv_ingest_api/interface/transform.py +0 -382
  12. nv_ingest_api/interface/utility.py +0 -200
  13. nv_ingest_api/internal/enums/__init__.py +0 -3
  14. nv_ingest_api/internal/enums/common.py +0 -494
  15. nv_ingest_api/internal/extract/__init__.py +0 -3
  16. nv_ingest_api/internal/extract/audio/__init__.py +0 -3
  17. nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -149
  18. nv_ingest_api/internal/extract/docx/__init__.py +0 -5
  19. nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -205
  20. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  21. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -3
  22. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -122
  23. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -895
  24. nv_ingest_api/internal/extract/image/__init__.py +0 -3
  25. nv_ingest_api/internal/extract/image/chart_extractor.py +0 -353
  26. nv_ingest_api/internal/extract/image/image_extractor.py +0 -204
  27. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -3
  28. nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -403
  29. nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -253
  30. nv_ingest_api/internal/extract/image/table_extractor.py +0 -344
  31. nv_ingest_api/internal/extract/pdf/__init__.py +0 -3
  32. nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -19
  33. nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -484
  34. nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -243
  35. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -597
  36. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -146
  37. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -603
  38. nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -96
  39. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -426
  40. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -74
  41. nv_ingest_api/internal/extract/pptx/__init__.py +0 -5
  42. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  43. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -799
  44. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -187
  45. nv_ingest_api/internal/mutate/__init__.py +0 -3
  46. nv_ingest_api/internal/mutate/deduplicate.py +0 -110
  47. nv_ingest_api/internal/mutate/filter.py +0 -133
  48. nv_ingest_api/internal/primitives/__init__.py +0 -0
  49. nv_ingest_api/internal/primitives/nim/__init__.py +0 -8
  50. nv_ingest_api/internal/primitives/nim/default_values.py +0 -15
  51. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -3
  52. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -274
  53. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -56
  54. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -270
  55. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -275
  56. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -238
  57. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -462
  58. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -367
  59. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -132
  60. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -152
  61. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -1400
  62. nv_ingest_api/internal/primitives/nim/nim_client.py +0 -344
  63. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -81
  64. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  65. nv_ingest_api/internal/primitives/tracing/latency.py +0 -69
  66. nv_ingest_api/internal/primitives/tracing/logging.py +0 -96
  67. nv_ingest_api/internal/primitives/tracing/tagging.py +0 -197
  68. nv_ingest_api/internal/schemas/__init__.py +0 -3
  69. nv_ingest_api/internal/schemas/extract/__init__.py +0 -3
  70. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -130
  71. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -135
  72. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -124
  73. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -124
  74. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -128
  75. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -218
  76. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -124
  77. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -129
  78. nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -3
  79. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -23
  80. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -34
  81. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -19
  82. nv_ingest_api/internal/schemas/meta/__init__.py +0 -3
  83. nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -11
  84. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -237
  85. nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -221
  86. nv_ingest_api/internal/schemas/mutate/__init__.py +0 -3
  87. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -16
  88. nv_ingest_api/internal/schemas/store/__init__.py +0 -3
  89. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -28
  90. nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -30
  91. nv_ingest_api/internal/schemas/transform/__init__.py +0 -3
  92. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -15
  93. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -17
  94. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -25
  95. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -22
  96. nv_ingest_api/internal/store/__init__.py +0 -3
  97. nv_ingest_api/internal/store/embed_text_upload.py +0 -236
  98. nv_ingest_api/internal/store/image_upload.py +0 -232
  99. nv_ingest_api/internal/transform/__init__.py +0 -3
  100. nv_ingest_api/internal/transform/caption_image.py +0 -205
  101. nv_ingest_api/internal/transform/embed_text.py +0 -496
  102. nv_ingest_api/internal/transform/split_text.py +0 -157
  103. nv_ingest_api/util/__init__.py +0 -0
  104. nv_ingest_api/util/control_message/__init__.py +0 -0
  105. nv_ingest_api/util/control_message/validators.py +0 -47
  106. nv_ingest_api/util/converters/__init__.py +0 -0
  107. nv_ingest_api/util/converters/bytetools.py +0 -78
  108. nv_ingest_api/util/converters/containers.py +0 -65
  109. nv_ingest_api/util/converters/datetools.py +0 -90
  110. nv_ingest_api/util/converters/dftools.py +0 -127
  111. nv_ingest_api/util/converters/formats.py +0 -64
  112. nv_ingest_api/util/converters/type_mappings.py +0 -27
  113. nv_ingest_api/util/detectors/__init__.py +0 -5
  114. nv_ingest_api/util/detectors/language.py +0 -38
  115. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  116. nv_ingest_api/util/exception_handlers/converters.py +0 -72
  117. nv_ingest_api/util/exception_handlers/decorators.py +0 -223
  118. nv_ingest_api/util/exception_handlers/detectors.py +0 -74
  119. nv_ingest_api/util/exception_handlers/pdf.py +0 -116
  120. nv_ingest_api/util/exception_handlers/schemas.py +0 -68
  121. nv_ingest_api/util/image_processing/__init__.py +0 -5
  122. nv_ingest_api/util/image_processing/clustering.py +0 -260
  123. nv_ingest_api/util/image_processing/processing.py +0 -179
  124. nv_ingest_api/util/image_processing/table_and_chart.py +0 -449
  125. nv_ingest_api/util/image_processing/transforms.py +0 -407
  126. nv_ingest_api/util/logging/__init__.py +0 -0
  127. nv_ingest_api/util/logging/configuration.py +0 -31
  128. nv_ingest_api/util/message_brokers/__init__.py +0 -3
  129. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -9
  130. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -465
  131. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -71
  132. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -435
  133. nv_ingest_api/util/metadata/__init__.py +0 -5
  134. nv_ingest_api/util/metadata/aggregators.py +0 -469
  135. nv_ingest_api/util/multi_processing/__init__.py +0 -8
  136. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -194
  137. nv_ingest_api/util/nim/__init__.py +0 -56
  138. nv_ingest_api/util/pdf/__init__.py +0 -3
  139. nv_ingest_api/util/pdf/pdfium.py +0 -427
  140. nv_ingest_api/util/schema/__init__.py +0 -0
  141. nv_ingest_api/util/schema/schema_validator.py +0 -10
  142. nv_ingest_api/util/service_clients/__init__.py +0 -3
  143. nv_ingest_api/util/service_clients/client_base.py +0 -72
  144. nv_ingest_api/util/service_clients/kafka/__init__.py +0 -3
  145. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  146. nv_ingest_api/util/service_clients/redis/redis_client.py +0 -334
  147. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  148. nv_ingest_api/util/service_clients/rest/rest_client.py +0 -398
  149. nv_ingest_api/util/string_processing/__init__.py +0 -51
  150. nv_ingest_api-2025.4.17.dev20250417.dist-info/RECORD +0 -152
  151. /nv_ingest_api/{internal → primitives}/__init__.py +0 -0
  152. {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.17.dev20250417.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/top_level.txt +0 -0
@@ -1,484 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
-
6
- # Copyright (c) 2024, NVIDIA CORPORATION.
7
- #
8
- # Licensed under the Apache License, Version 2.0 (the "License");
9
- # you may not use this file except in compliance with the License.
10
- # You may obtain a copy of the License at
11
- #
12
- # http://www.apache.org/licenses/LICENSE-2.0
13
- #
14
- # Unless required by applicable law or agreed to in writing, software
15
- # distributed under the License is distributed on an "AS IS" BASIS,
16
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
- # See the License for the specific language governing permissions and
18
- # limitations under the License.
19
-
20
- import io
21
- import json
22
- import logging
23
- import random
24
- import time
25
- import uuid
26
- import zipfile
27
- from typing import Optional, List, Any
28
-
29
- import pandas as pd
30
- import pypdfium2 as pdfium
31
-
32
- from nv_ingest_api.internal.enums.common import AccessLevelEnum, DocumentTypeEnum
33
- from nv_ingest_api.internal.enums.common import ContentTypeEnum
34
- from nv_ingest_api.internal.enums.common import ContentDescriptionEnum
35
- from nv_ingest_api.internal.enums.common import TableFormatEnum
36
- from nv_ingest_api.internal.enums.common import TextTypeEnum
37
- from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadata
38
- from nv_ingest_api.util.converters import bytetools
39
- from nv_ingest_api.util.metadata.aggregators import extract_pdf_metadata, construct_text_metadata
40
-
41
- ADOBE_INSTALLED = True
42
- try:
43
- from adobe.pdfservices.operation.auth.service_principal_credentials import ServicePrincipalCredentials
44
- from adobe.pdfservices.operation.exception.exceptions import SdkException
45
- from adobe.pdfservices.operation.exception.exceptions import ServiceApiException
46
- from adobe.pdfservices.operation.exception.exceptions import ServiceUsageException
47
- from adobe.pdfservices.operation.io.cloud_asset import CloudAsset
48
- from adobe.pdfservices.operation.io.stream_asset import StreamAsset
49
- from adobe.pdfservices.operation.pdf_services import PDFServices
50
- from adobe.pdfservices.operation.pdf_services_media_type import PDFServicesMediaType
51
- from adobe.pdfservices.operation.pdfjobs.jobs.extract_pdf_job import ExtractPDFJob
52
- from adobe.pdfservices.operation.pdfjobs.params.extract_pdf import extract_renditions_element_type
53
- from adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_element_type import ExtractElementType
54
- from adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_pdf_params import ExtractPDFParams
55
- from adobe.pdfservices.operation.pdfjobs.params.extract_pdf.table_structure_type import TableStructureType
56
- from adobe.pdfservices.operation.pdfjobs.result.extract_pdf_result import ExtractPDFResult
57
-
58
- ExtractRenditionsElementType = (
59
- extract_renditions_element_type.ExtractRenditionsElementType
60
- ) # black / isort conflict
61
- except ImportError:
62
- ADOBE_INSTALLED = False
63
- logger = logging.getLogger(__name__)
64
-
65
-
66
def adobe_extractor(
    pdf_stream: io.BytesIO,
    extract_text: bool,
    extract_images: bool,
    extract_infographics: bool,
    extract_tables: bool,
    extractor_config: dict,
    execution_trace_log: Optional[List[Any]] = None,
) -> List[Any]:
    """
    Extract text, figures and tables from a bytestream PDF with the Adobe PDF Services SDK.

    Parameters
    ----------
    pdf_stream : io.BytesIO
        A bytestream PDF.
    extract_text : bool
        Specifies whether to extract text.
    extract_images : bool
        Specifies whether to extract images (Adobe "Figure" renditions).
    extract_infographics : bool
        Accepted for signature parity; not used by the Adobe backend.
    extract_tables : bool
        Specifies whether to extract tables.
    extractor_config : dict
        Extraction settings. ``adobe_client_id``, ``adobe_client_secret`` and ``row_data``
        are required; ``text_depth`` (default ``"page"``), ``identify_nearby_objects``
        (default ``True``) and ``metadata_column`` (default ``"metadata"``) are optional.
    execution_trace_log : optional
        Accepted for signature parity; not used by the Adobe backend.

    Returns
    -------
    list
        One record per extracted item, in document order. Image and table records are
        ``[content_type, metadata_dict, uuid_str]`` lists; text records are whatever
        ``construct_text_metadata`` returns. An empty list is returned when the Adobe
        request fails and is not retried (the SDK error is logged, not raised).

    Raises
    ------
    RuntimeError
        If the Adobe SDK is not installed.
    ValueError
        If required configuration parameters are missing or invalid.
    """

    # Not used for Adobe extraction, currently.
    _ = execution_trace_log
    _ = extract_infographics

    logger.debug("Extracting PDF with Adobe backend.")
    if not ADOBE_INSTALLED:
        err_msg = (
            "Adobe SDK not installed -- cannot extract PDF.\r\nTo install the adobe SDK please review the "
            "license agreement at https://github.com/adobe/pdfservices-python-sdk?tab=License-1-ov-file and "
            "re-launch the nv-ingest microservice with -e INSTALL_ADOBE_SDK=True."
        )
        logger.error(err_msg)
        raise RuntimeError(err_msg)

    # Ensure extractor_config is a dictionary.
    if not isinstance(extractor_config, dict):
        raise ValueError("extractor_config must be a dictionary.")

    # Retrieve Adobe API keys.
    client_id = extractor_config.get("adobe_client_id")
    client_secret = extractor_config.get("adobe_client_secret")
    if not client_id or not client_secret:
        raise ValueError(
            "Missing Adobe API credentials in extractor_config (adobe_client_id and adobe_client_secret are required)."
        )

    # Get row_data from configuration.
    row_data = extractor_config.get("row_data")
    if row_data is None:
        raise ValueError("Missing 'row_data' in extractor_config.")

    # Retrieve source information.
    source_id = row_data.get("source_id")
    file_name = row_data.get("id", "_.pdf")

    # Retrieve and validate text_depth.
    text_depth_str = extractor_config.get("text_depth", "page")
    try:
        text_depth = TextTypeEnum[text_depth_str.upper()]
    except KeyError:
        valid_options = [e.name.lower() for e in TextTypeEnum]
        raise ValueError(f"Invalid text_depth value: {text_depth_str}. Expected one of: {valid_options}")

    # Optional settings.
    identify_nearby_objects = extractor_config.get("identify_nearby_objects", True)
    metadata_col = extractor_config.get("metadata_column", "metadata")
    # row_data may expose an ``index`` attribute (checked with ``in``) or be a plain mapping.
    if hasattr(row_data, "index"):
        base_unified_metadata = row_data[metadata_col] if metadata_col in row_data.index else {}
    else:
        base_unified_metadata = row_data.get(metadata_col, {})

    # Source-level fields are carried over from the incoming source_metadata when present.
    base_source_metadata = base_unified_metadata.get("source_metadata", {})
    source_metadata = {
        "source_name": file_name,
        "source_id": source_id,
        "source_location": base_source_metadata.get("source_location", ""),
        "collection_id": base_source_metadata.get("collection_id", ""),
        "summary": "",
        "partition_id": base_source_metadata.get("partition_id", -1),
        "access_level": base_source_metadata.get("access_level", AccessLevelEnum.UNKNOWN),
    }

    doc = pdfium.PdfDocument(pdf_stream)
    pdf_metadata = extract_pdf_metadata(doc, source_id)
    source_metadata.update(
        {
            "source_type": pdf_metadata.source_type,
            "date_created": pdf_metadata.date_created,
            "last_modified": pdf_metadata.last_modified,
        }
    )

    result_stream = _request_adobe_extraction(
        pdf_stream, client_id, client_secret, extract_text, extract_images, extract_tables
    )
    if result_stream is None:
        # The request failed and was not retried; the error has already been logged.
        return []

    extracted_data = []
    accumulated_text = []
    page_idx = 0
    # Pre-bind so the trailing page flush works even when the result has no elements.
    block_idx = -1
    page_nearby_blocks = _new_nearby_blocks()

    # The archive must stay open for the whole loop: figures and tables are read from it lazily.
    with zipfile.ZipFile(result_stream) as archive:
        with archive.open("structuredData.json") as json_entry:
            data = json.loads(json_entry.read())

        for block_idx, item in enumerate(data["elements"]):
            # Extract text
            if extract_text and "Text" in item and "Table" not in item["Path"] and "Figure" not in item["Path"]:
                if item["Page"] != page_idx:
                    if text_depth == TextTypeEnum.PAGE:
                        # Page boundary crossed: flush the text gathered for the previous page.
                        text_extraction = construct_text_metadata(
                            accumulated_text,
                            pdf_metadata.page_count,
                            page_idx,
                            block_idx,
                            text_depth,
                            source_metadata,
                            base_unified_metadata,
                            bbox=(0, 0, data["pages"][page_idx]["width"], data["pages"][page_idx]["height"]),
                        )

                        if len(text_extraction) > 0:
                            extracted_data.append(text_extraction)

                        accumulated_text = []

                    page_nearby_blocks = _new_nearby_blocks()
                    page_idx = item["Page"]

                accumulated_text.append(item["Text"].strip())

                if text_depth == TextTypeEnum.BLOCK:
                    bounds = item["Bounds"]

                    text_extraction = construct_text_metadata(
                        accumulated_text,
                        pdf_metadata.page_count,
                        item["Page"],
                        block_idx,
                        text_depth,
                        source_metadata,
                        base_unified_metadata,
                        bbox=(bounds[0], bounds[1], bounds[2], bounds[3]),
                    )

                    if len(text_extraction) > 0:
                        extracted_data.append(text_extraction)

                    accumulated_text = []

                if (extract_images and identify_nearby_objects) and (len(item["Text"]) > 0):
                    bounds = item["Bounds"]
                    # item["Text"] is a single string; joining it would interleave spaces between characters.
                    page_nearby_blocks["text"]["content"].append(item["Text"].strip())
                    page_nearby_blocks["text"]["bbox"].append((bounds[0], bounds[1], bounds[2], bounds[3]))

            # Extract images
            if extract_images and item["Path"].endswith("/Figure"):
                bounds = item["Bounds"]

                try:
                    with archive.open(item["filePaths"][0]) as figure:
                        base64_img = bytetools.base64frombytes(figure.read())
                except KeyError:
                    # No "filePaths" entry, or the named member is absent from the archive.
                    base64_img = ""

                image_extraction = _construct_image_metadata(
                    base64_img,
                    item.get("Text", ""),
                    pdf_metadata.page_count,
                    item["Page"],
                    block_idx,
                    source_metadata,
                    base_unified_metadata,
                    page_nearby_blocks,
                    bbox=(bounds[0], bounds[1], bounds[2], bounds[3]),
                )

                extracted_data.append(image_extraction)

            # Extract tables
            if extract_tables and item["Path"].endswith("/Table"):
                bounds = item["Bounds"]

                try:
                    with archive.open(item["filePaths"][0]) as table_csv:
                        df = pd.read_csv(table_csv, delimiter=",")
                except KeyError:
                    # No "filePaths" entry, or the named member is absent from the archive.
                    df = pd.DataFrame()

                table_extraction = _construct_table_metadata(
                    df.to_markdown(),
                    pdf_metadata.page_count,
                    item["Page"],
                    block_idx,
                    source_metadata,
                    base_unified_metadata,
                    bbox=(bounds[0], bounds[1], bounds[2], bounds[3]),
                )

                extracted_data.append(table_extraction)

    # Flush whatever text is still pending for the last page (no bbox is supplied here).
    if extract_text and text_depth == TextTypeEnum.PAGE:
        text_extraction = construct_text_metadata(
            accumulated_text,
            pdf_metadata.page_count,
            page_idx,
            block_idx,
            text_depth,
            source_metadata,
            base_unified_metadata,
        )

        if len(text_extraction) > 0:
            extracted_data.append(text_extraction)

    # At document depth the text was never flushed inside the loop; emit it as one record.
    if extract_text and text_depth == TextTypeEnum.DOCUMENT:
        text_extraction = construct_text_metadata(
            accumulated_text,
            pdf_metadata.page_count,
            -1,
            -1,
            text_depth,
            source_metadata,
            base_unified_metadata,
        )

        if len(text_extraction) > 0:
            extracted_data.append(text_extraction)

    return extracted_data


def _new_nearby_blocks() -> dict:
    """Return a fresh, empty per-page container for nearby text, image and structured blocks."""
    return {
        "text": {"content": [], "bbox": []},
        "images": {"content": [], "bbox": []},
        "structured": {"content": [], "bbox": []},
    }


def _request_adobe_extraction(
    pdf_stream: io.BytesIO,
    client_id: str,
    client_secret: str,
    extract_text: bool,
    extract_images: bool,
    extract_tables: bool,
) -> Optional[io.BytesIO]:
    """
    Run an Adobe ExtractPDF job and return the zipped result as a stream, or None on failure.

    ``ServiceUsageException`` is retried with a growing, jittered delay for as long as the
    next delay stays below ``max_delay``; any other SDK error is logged and ends the attempt.
    """
    retry_delay = 1
    max_delay = 50
    while True:
        try:
            # Initial setup, create credentials instance
            credentials = ServicePrincipalCredentials(
                client_id=client_id,
                client_secret=client_secret,
            )

            # Creates a PDF Services instance
            pdf_services = PDFServices(credentials=credentials)

            # Rewind so every attempt uploads the whole document, wherever earlier reads left the cursor.
            pdf_stream.seek(0)

            # Creates an asset(s) from source file(s) and upload
            input_asset = pdf_services.upload(input_stream=pdf_stream, mime_type=PDFServicesMediaType.PDF)

            # Create parameters for the job
            elements_to_extract = []
            if extract_text:
                elements_to_extract.append(ExtractElementType.TEXT)
            if extract_tables:
                elements_to_extract.append(ExtractElementType.TABLES)

            extract_pdf_params = ExtractPDFParams(
                table_structure_type=TableStructureType.CSV,
                elements_to_extract=elements_to_extract,
                elements_to_extract_renditions=[ExtractRenditionsElementType.FIGURES] if extract_images else [],
            )

            # Creates a new job instance
            extract_pdf_job = ExtractPDFJob(input_asset=input_asset, extract_pdf_params=extract_pdf_params)

            # Submit the job and gets the job result
            location = pdf_services.submit(extract_pdf_job)
            pdf_services_response = pdf_services.get_job_result(location, ExtractPDFResult)

            # Get content from the resulting asset(s)
            result_asset: CloudAsset = pdf_services_response.get_result().get_resource()
            stream_asset: StreamAsset = pdf_services.get_content(result_asset)

            # Request successful
            return io.BytesIO(stream_asset.get_input_stream())

        except (ServiceApiException, ServiceUsageException, SdkException) as e:
            if isinstance(e, ServiceUsageException) and (retry_delay * 1.1) < max_delay:
                # Log before sleeping so the reported delay is the one actually waited.
                logger.error(
                    "Exception encountered while executing operation: %s, retrying in %ds.", e, int(retry_delay)
                )
                time.sleep(retry_delay)
                retry_delay *= 1.1
                retry_delay += random.uniform(0, 1)
            else:
                logger.exception("Exception encountered while executing operation: %s", e)
                return None
391
-
392
-
393
def _construct_image_metadata(
    image,
    image_text,
    page_count,
    page_idx,
    block_idx,
    source_metadata,
    base_unified_metadata,
    page_nearby_blocks,
    bbox,
):
    """
    Build the validated metadata record for one extracted figure.

    The record is a three-item list: the IMAGE content-type value, the validated
    unified metadata dumped to a dict, and a freshly generated UUID string.
    ``base_unified_metadata`` is copied (shallowly) before the image-specific
    sections are written, so the caller's mapping is not modified at the top level.
    """
    # Position of the figure within the document; line/span are fixed at -1 here.
    hierarchy = {
        "page_count": page_count,
        "page": page_idx,
        "block": block_idx,
        "line": -1,
        "span": -1,
        "nearby_objects": page_nearby_blocks,
    }
    content_section = {
        "type": ContentTypeEnum.IMAGE,
        "description": ContentDescriptionEnum.PDF_IMAGE,
        "page_number": page_idx,
        "hierarchy": hierarchy,
    }

    # Width and height are derived from the bounding box corners.
    image_section = {
        "image_type": DocumentTypeEnum.PNG,
        "caption": "",
        "text": image_text,
        "image_location": bbox,
        "width": bbox[2] - bbox[0],
        "height": bbox[3] - bbox[1],
    }

    merged = base_unified_metadata.copy()
    merged["content"] = image
    merged["source_metadata"] = source_metadata
    merged["content_metadata"] = content_section
    merged["image_metadata"] = image_section

    validated = validate_metadata(merged)
    record_id = str(uuid.uuid4())

    return [ContentTypeEnum.IMAGE.value, validated.model_dump(), record_id]
441
-
442
-
443
def _construct_table_metadata(
    table,
    page_count,
    page_idx,
    block_idx,
    source_metadata,
    base_unified_metadata,
    bbox,
):
    """
    Build the validated metadata record for one extracted table.

    The record is a three-item list: the STRUCTURED content-type value, the
    validated unified metadata dumped to a dict, and a freshly generated UUID
    string. ``table`` is stored as the record content and is labelled as markdown.
    """
    # Position of the table within the document; line/span are fixed at -1 here.
    hierarchy = {
        "page_count": page_count,
        "page": page_idx,
        "block": block_idx,
        "line": -1,
        "span": -1,
    }
    content_section = {
        "type": ContentTypeEnum.STRUCTURED,
        "description": ContentDescriptionEnum.PDF_TABLE,
        "page_number": page_idx,
        "hierarchy": hierarchy,
    }
    table_section = {
        "caption": "",
        "table_format": TableFormatEnum.MARKDOWN,
        "table_location": bbox,
    }

    # Shallow copy so the caller's base metadata keeps its own top-level keys.
    merged = base_unified_metadata.copy()
    merged["content"] = table
    merged["source_metadata"] = source_metadata
    merged["content_metadata"] = content_section
    merged["table_metadata"] = table_section

    validated = validate_metadata(merged)
    record_id = str(uuid.uuid4())

    return [ContentTypeEnum.STRUCTURED.value, validated.model_dump(), record_id]