nv-ingest-api 2025.4.18.dev20250418__py3-none-any.whl → 2025.4.19.dev20250419__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nv-ingest-api has been flagged as potentially problematic. See the registry's security advisory for more details.
- nv_ingest_api/__init__.py +0 -3
- nv_ingest_api/{internal/primitives → primitives}/control_message_task.py +0 -4
- nv_ingest_api/{internal/primitives → primitives}/ingest_control_message.py +2 -5
- {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/METADATA +1 -1
- nv_ingest_api-2025.4.19.dev20250419.dist-info/RECORD +9 -0
- {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/WHEEL +1 -1
- nv_ingest_api/interface/__init__.py +0 -215
- nv_ingest_api/interface/extract.py +0 -972
- nv_ingest_api/interface/mutate.py +0 -154
- nv_ingest_api/interface/store.py +0 -218
- nv_ingest_api/interface/transform.py +0 -382
- nv_ingest_api/interface/utility.py +0 -200
- nv_ingest_api/internal/enums/__init__.py +0 -3
- nv_ingest_api/internal/enums/common.py +0 -494
- nv_ingest_api/internal/extract/__init__.py +0 -3
- nv_ingest_api/internal/extract/audio/__init__.py +0 -3
- nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -149
- nv_ingest_api/internal/extract/docx/__init__.py +0 -5
- nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -205
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -3
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -122
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -895
- nv_ingest_api/internal/extract/image/__init__.py +0 -3
- nv_ingest_api/internal/extract/image/chart_extractor.py +0 -353
- nv_ingest_api/internal/extract/image/image_extractor.py +0 -204
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -3
- nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -403
- nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -253
- nv_ingest_api/internal/extract/image/table_extractor.py +0 -344
- nv_ingest_api/internal/extract/pdf/__init__.py +0 -3
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -19
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -484
- nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -243
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -597
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -146
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -603
- nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -96
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -426
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -74
- nv_ingest_api/internal/extract/pptx/__init__.py +0 -5
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -799
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -187
- nv_ingest_api/internal/mutate/__init__.py +0 -3
- nv_ingest_api/internal/mutate/deduplicate.py +0 -110
- nv_ingest_api/internal/mutate/filter.py +0 -133
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/internal/primitives/nim/__init__.py +0 -8
- nv_ingest_api/internal/primitives/nim/default_values.py +0 -15
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -3
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -274
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -56
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -270
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -275
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -238
- nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -462
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -367
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -132
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -152
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -1400
- nv_ingest_api/internal/primitives/nim/nim_client.py +0 -344
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -81
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +0 -69
- nv_ingest_api/internal/primitives/tracing/logging.py +0 -96
- nv_ingest_api/internal/primitives/tracing/tagging.py +0 -197
- nv_ingest_api/internal/schemas/__init__.py +0 -3
- nv_ingest_api/internal/schemas/extract/__init__.py +0 -3
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -130
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -135
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -124
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -124
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -128
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -218
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -124
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -129
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -3
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -23
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -34
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -19
- nv_ingest_api/internal/schemas/meta/__init__.py +0 -3
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -11
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -237
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -221
- nv_ingest_api/internal/schemas/mutate/__init__.py +0 -3
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -16
- nv_ingest_api/internal/schemas/store/__init__.py +0 -3
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -28
- nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -30
- nv_ingest_api/internal/schemas/transform/__init__.py +0 -3
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -15
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -17
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -25
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -22
- nv_ingest_api/internal/store/__init__.py +0 -3
- nv_ingest_api/internal/store/embed_text_upload.py +0 -236
- nv_ingest_api/internal/store/image_upload.py +0 -232
- nv_ingest_api/internal/transform/__init__.py +0 -3
- nv_ingest_api/internal/transform/caption_image.py +0 -205
- nv_ingest_api/internal/transform/embed_text.py +0 -496
- nv_ingest_api/internal/transform/split_text.py +0 -157
- nv_ingest_api/util/__init__.py +0 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +0 -47
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +0 -78
- nv_ingest_api/util/converters/containers.py +0 -65
- nv_ingest_api/util/converters/datetools.py +0 -90
- nv_ingest_api/util/converters/dftools.py +0 -127
- nv_ingest_api/util/converters/formats.py +0 -64
- nv_ingest_api/util/converters/type_mappings.py +0 -27
- nv_ingest_api/util/detectors/__init__.py +0 -5
- nv_ingest_api/util/detectors/language.py +0 -38
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +0 -72
- nv_ingest_api/util/exception_handlers/decorators.py +0 -223
- nv_ingest_api/util/exception_handlers/detectors.py +0 -74
- nv_ingest_api/util/exception_handlers/pdf.py +0 -116
- nv_ingest_api/util/exception_handlers/schemas.py +0 -68
- nv_ingest_api/util/image_processing/__init__.py +0 -5
- nv_ingest_api/util/image_processing/clustering.py +0 -260
- nv_ingest_api/util/image_processing/processing.py +0 -179
- nv_ingest_api/util/image_processing/table_and_chart.py +0 -449
- nv_ingest_api/util/image_processing/transforms.py +0 -407
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +0 -31
- nv_ingest_api/util/message_brokers/__init__.py +0 -3
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -9
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -465
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -71
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -451
- nv_ingest_api/util/metadata/__init__.py +0 -5
- nv_ingest_api/util/metadata/aggregators.py +0 -469
- nv_ingest_api/util/multi_processing/__init__.py +0 -8
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -194
- nv_ingest_api/util/nim/__init__.py +0 -56
- nv_ingest_api/util/pdf/__init__.py +0 -3
- nv_ingest_api/util/pdf/pdfium.py +0 -427
- nv_ingest_api/util/schema/__init__.py +0 -0
- nv_ingest_api/util/schema/schema_validator.py +0 -10
- nv_ingest_api/util/service_clients/__init__.py +0 -3
- nv_ingest_api/util/service_clients/client_base.py +0 -86
- nv_ingest_api/util/service_clients/kafka/__init__.py +0 -3
- nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +0 -823
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +0 -531
- nv_ingest_api/util/string_processing/__init__.py +0 -51
- nv_ingest_api-2025.4.18.dev20250418.dist-info/RECORD +0 -152
- /nv_ingest_api/{internal → primitives}/__init__.py +0 -0
- {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/top_level.txt +0 -0
|
@@ -1,484 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
-
# All rights reserved.
|
|
3
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
# Copyright (c) 2024, NVIDIA CORPORATION.
|
|
7
|
-
#
|
|
8
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
9
|
-
# you may not use this file except in compliance with the License.
|
|
10
|
-
# You may obtain a copy of the License at
|
|
11
|
-
#
|
|
12
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
13
|
-
#
|
|
14
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
15
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
16
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
17
|
-
# See the License for the specific language governing permissions and
|
|
18
|
-
# limitations under the License.
|
|
19
|
-
|
|
20
|
-
import io
|
|
21
|
-
import json
|
|
22
|
-
import logging
|
|
23
|
-
import random
|
|
24
|
-
import time
|
|
25
|
-
import uuid
|
|
26
|
-
import zipfile
|
|
27
|
-
from typing import Optional, List, Any
|
|
28
|
-
|
|
29
|
-
import pandas as pd
|
|
30
|
-
import pypdfium2 as pdfium
|
|
31
|
-
|
|
32
|
-
from nv_ingest_api.internal.enums.common import AccessLevelEnum, DocumentTypeEnum
|
|
33
|
-
from nv_ingest_api.internal.enums.common import ContentTypeEnum
|
|
34
|
-
from nv_ingest_api.internal.enums.common import ContentDescriptionEnum
|
|
35
|
-
from nv_ingest_api.internal.enums.common import TableFormatEnum
|
|
36
|
-
from nv_ingest_api.internal.enums.common import TextTypeEnum
|
|
37
|
-
from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadata
|
|
38
|
-
from nv_ingest_api.util.converters import bytetools
|
|
39
|
-
from nv_ingest_api.util.metadata.aggregators import extract_pdf_metadata, construct_text_metadata
|
|
40
|
-
|
|
41
|
-
ADOBE_INSTALLED = True
|
|
42
|
-
try:
|
|
43
|
-
from adobe.pdfservices.operation.auth.service_principal_credentials import ServicePrincipalCredentials
|
|
44
|
-
from adobe.pdfservices.operation.exception.exceptions import SdkException
|
|
45
|
-
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException
|
|
46
|
-
from adobe.pdfservices.operation.exception.exceptions import ServiceUsageException
|
|
47
|
-
from adobe.pdfservices.operation.io.cloud_asset import CloudAsset
|
|
48
|
-
from adobe.pdfservices.operation.io.stream_asset import StreamAsset
|
|
49
|
-
from adobe.pdfservices.operation.pdf_services import PDFServices
|
|
50
|
-
from adobe.pdfservices.operation.pdf_services_media_type import PDFServicesMediaType
|
|
51
|
-
from adobe.pdfservices.operation.pdfjobs.jobs.extract_pdf_job import ExtractPDFJob
|
|
52
|
-
from adobe.pdfservices.operation.pdfjobs.params.extract_pdf import extract_renditions_element_type
|
|
53
|
-
from adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_element_type import ExtractElementType
|
|
54
|
-
from adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_pdf_params import ExtractPDFParams
|
|
55
|
-
from adobe.pdfservices.operation.pdfjobs.params.extract_pdf.table_structure_type import TableStructureType
|
|
56
|
-
from adobe.pdfservices.operation.pdfjobs.result.extract_pdf_result import ExtractPDFResult
|
|
57
|
-
|
|
58
|
-
ExtractRenditionsElementType = (
|
|
59
|
-
extract_renditions_element_type.ExtractRenditionsElementType
|
|
60
|
-
) # black / isort conflict
|
|
61
|
-
except ImportError:
|
|
62
|
-
ADOBE_INSTALLED = False
|
|
63
|
-
logger = logging.getLogger(__name__)
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
def adobe_extractor(
    pdf_stream: io.BytesIO,
    extract_text: bool,
    extract_images: bool,
    extract_infographics: bool,
    extract_tables: bool,
    extractor_config: dict,
    execution_trace_log: Optional[List[Any]] = None,
) -> List[Any]:
    """
    Extract text, images, and tables from a bytestream PDF using the Adobe
    PDF Services API.

    Parameters
    ----------
    pdf_stream : io.BytesIO
        A bytestream PDF.
    extract_text : bool
        Specifies whether to extract text.
    extract_images : bool
        Specifies whether to extract images.
    extract_infographics : bool
        Specifies whether to extract infographics (currently unused by this
        backend).
    extract_tables : bool
        Specifies whether to extract tables.
    extractor_config : dict
        A dictionary containing additional extraction parameters: the Adobe
        API credentials (``adobe_client_id`` / ``adobe_client_secret``),
        ``row_data``, ``text_depth``, and other optional settings.
    execution_trace_log : optional
        Trace information for debugging purposes (currently unused by this
        backend).

    Returns
    -------
    list
        A list of extracted elements; each entry is a three-element list of
        ``[content_type_value, validated_metadata_dict, uuid_str]``. An empty
        list is returned if the Adobe service request ultimately fails.

    Raises
    ------
    RuntimeError
        If the Adobe SDK is not installed.
    ValueError
        If required configuration parameters are missing or invalid.
    """

    # Not used for Adobe extraction, currently.
    _ = execution_trace_log
    _ = extract_infographics

    logger.debug("Extracting PDF with Adobe backend.")
    if not ADOBE_INSTALLED:
        err_msg = (
            "Adobe SDK not installed -- cannot extract PDF.\r\nTo install the adobe SDK please review the "
            "license agreement at https://github.com/adobe/pdfservices-python-sdk?tab=License-1-ov-file and "
            "re-launch the nv-ingest microservice with -e INSTALL_ADOBE_SDK=True."
        )
        logger.error(err_msg)
        raise RuntimeError(err_msg)

    # Ensure extractor_config is a dictionary.
    if not isinstance(extractor_config, dict):
        raise ValueError("extractor_config must be a dictionary.")

    # Retrieve Adobe API keys.
    client_id = extractor_config.get("adobe_client_id")
    client_secret = extractor_config.get("adobe_client_secret")
    if not client_id or not client_secret:
        raise ValueError(
            "Missing Adobe API credentials in extractor_config (adobe_client_id and adobe_client_secret are required)."
        )

    # Get row_data from configuration.
    row_data = extractor_config.get("row_data")
    if row_data is None:
        raise ValueError("Missing 'row_data' in extractor_config.")

    # Retrieve source information.
    source_id = row_data.get("source_id")
    file_name = row_data.get("id", "_.pdf")

    # Retrieve and validate text_depth.
    text_depth_str = extractor_config.get("text_depth", "page")
    try:
        text_depth = TextTypeEnum[text_depth_str.upper()]
    except KeyError:
        valid_options = [e.name.lower() for e in TextTypeEnum]
        raise ValueError(f"Invalid text_depth value: {text_depth_str}. Expected one of: {valid_options}")

    # Optional settings.
    identify_nearby_objects = extractor_config.get("identify_nearby_objects", True)
    metadata_col = extractor_config.get("metadata_column", "metadata")
    # row_data may be a pandas Series (has .index) or a plain dict.
    if hasattr(row_data, "index"):
        base_unified_metadata = row_data[metadata_col] if metadata_col in row_data.index else {}
    else:
        base_unified_metadata = row_data.get(metadata_col, {})

    # get base source_metadata
    base_source_metadata = base_unified_metadata.get("source_metadata", {})
    # get source_location
    source_location = base_source_metadata.get("source_location", "")
    # get collection_id (assuming coming in from source_metadata...)
    collection_id = base_source_metadata.get("collection_id", "")
    # get partition_id (assuming coming in from source_metadata...)
    partition_id = base_source_metadata.get("partition_id", -1)
    # get access_level (assuming coming in from source_metadata...)
    access_level = base_source_metadata.get("access_level", AccessLevelEnum.UNKNOWN)

    source_metadata = {
        "source_name": file_name,
        "source_id": source_id,
        "source_location": source_location,
        "collection_id": collection_id,
        "summary": "",
        "partition_id": partition_id,
        "access_level": access_level,
    }

    # Page count / dates come from local pdfium inspection, not the Adobe API.
    doc = pdfium.PdfDocument(pdf_stream)
    pdf_metadata = extract_pdf_metadata(doc, source_id)

    document_metadata = {
        "source_type": pdf_metadata.source_type,
        "date_created": pdf_metadata.date_created,
        "last_modified": pdf_metadata.last_modified,
    }

    source_metadata.update(document_metadata)

    # Submit the extraction job, retrying on usage-quota errors with
    # exponential backoff plus jitter, capped at max_delay seconds.
    retry_delay = 1
    max_delay = 50
    while True:
        try:
            # Initial setup, create credentials instance
            credentials = ServicePrincipalCredentials(
                client_id=client_id,
                client_secret=client_secret,
            )

            # Creates a PDF Services instance
            pdf_services = PDFServices(credentials=credentials)

            # Creates an asset(s) from source file(s) and upload
            input_asset = pdf_services.upload(input_stream=pdf_stream, mime_type=PDFServicesMediaType.PDF)

            # Create parameters for the job
            elements_to_extract = []
            if extract_text:
                elements_to_extract.append(ExtractElementType.TEXT)
            if extract_tables:
                elements_to_extract.append(ExtractElementType.TABLES)

            extract_pdf_params = ExtractPDFParams(
                table_structure_type=TableStructureType.CSV,
                elements_to_extract=elements_to_extract,
                elements_to_extract_renditions=[ExtractRenditionsElementType.FIGURES] if extract_images else [],
            )

            # Creates a new job instance
            extract_pdf_job = ExtractPDFJob(input_asset=input_asset, extract_pdf_params=extract_pdf_params)

            # Submit the job and gets the job result
            location = pdf_services.submit(extract_pdf_job)
            pdf_services_response = pdf_services.get_job_result(location, ExtractPDFResult)

            # Get content from the resulting asset(s)
            result_asset: CloudAsset = pdf_services_response.get_result().get_resource()
            stream_asset: StreamAsset = pdf_services.get_content(result_asset)

            # The result is a zip archive containing structuredData.json plus
            # any figure/table rendition files referenced by "filePaths".
            archive = zipfile.ZipFile(io.BytesIO(stream_asset.get_input_stream()))
            jsonentry = archive.open("structuredData.json")
            jsondata = jsonentry.read()
            data = json.loads(jsondata)

            # Request successful
            break

        except (ServiceApiException, ServiceUsageException, SdkException) as e:
            if isinstance(e, ServiceUsageException) and (retry_delay * 1.1) < max_delay:
                # Log before sleeping so the reported delay matches the actual
                # wait (the original logged the *next* delay after sleeping).
                logger.error(f"Exception encountered while executing operation: {e}, retrying in {int(retry_delay)}s.")
                time.sleep(retry_delay)
                retry_delay *= 1.1
                retry_delay += random.uniform(0, 1)
            else:
                logger.exception(f"Exception encountered while executing operation: {e}")
                return []

    extracted_data = []
    accumulated_text = []
    page_idx = 0
    # Initialize so the post-loop PAGE flush below cannot raise NameError when
    # the document yields no elements.
    block_idx = 0

    page_nearby_blocks = {
        "text": {"content": [], "bbox": []},
        "images": {"content": [], "bbox": []},
        "structured": {"content": [], "bbox": []},
    }

    for block_idx, item in enumerate(data["elements"]):
        # Extract text
        if extract_text and "Text" in item and "Table" not in item["Path"] and "Figure" not in item["Path"]:
            # A page boundary: flush the accumulated page text before moving on.
            if item["Page"] != page_idx:
                if text_depth == TextTypeEnum.PAGE:
                    text_extraction = construct_text_metadata(
                        accumulated_text,
                        pdf_metadata.page_count,
                        page_idx,
                        block_idx,
                        text_depth,
                        source_metadata,
                        base_unified_metadata,
                        bbox=(0, 0, data["pages"][page_idx]["width"], data["pages"][page_idx]["height"]),
                    )

                    if len(text_extraction) > 0:
                        extracted_data.append(text_extraction)

                    accumulated_text = []

                page_nearby_blocks = {
                    "text": {"content": [], "bbox": []},
                    "images": {"content": [], "bbox": []},
                    "structured": {"content": [], "bbox": []},
                }
                page_idx = item["Page"]

            accumulated_text.append(item["Text"].strip())

            if text_depth == TextTypeEnum.BLOCK:
                bounds = item["Bounds"]

                text_extraction = construct_text_metadata(
                    accumulated_text,
                    pdf_metadata.page_count,
                    item["Page"],
                    block_idx,
                    text_depth,
                    source_metadata,
                    base_unified_metadata,
                    bbox=(bounds[0], bounds[1], bounds[2], bounds[3]),
                )

                if len(text_extraction) > 0:
                    extracted_data.append(text_extraction)

                accumulated_text = []

            if (extract_images and identify_nearby_objects) and (len(item["Text"]) > 0):
                bounds = item["Bounds"]
                # BUG FIX: the original did `" ".join(item["Text"].strip())`,
                # which joins the *characters* of the string with spaces
                # ("abc" -> "a b c"); append the stripped text directly.
                page_nearby_blocks["text"]["content"].append(item["Text"].strip())
                page_nearby_blocks["text"]["bbox"].append((bounds[0], bounds[1], bounds[2], bounds[3]))

        # Extract images
        if extract_images and item["Path"].endswith("/Figure"):
            bounds = item["Bounds"]

            try:
                figure = archive.open(item["filePaths"][0])
                base64_img = bytetools.base64frombytes(figure.read())
            except KeyError:
                # Rendition missing from the archive; keep the element with
                # empty content rather than failing the whole document.
                base64_img = ""

            image_extraction = _construct_image_metadata(
                base64_img,
                item.get("Text", ""),
                pdf_metadata.page_count,
                item["Page"],
                block_idx,
                source_metadata,
                base_unified_metadata,
                page_nearby_blocks,
                bbox=(bounds[0], bounds[1], bounds[2], bounds[3]),
            )

            extracted_data.append(image_extraction)

        # Extract tables
        if extract_tables and item["Path"].endswith("/Table"):
            bounds = item["Bounds"]

            try:
                df = pd.read_csv(archive.open(item["filePaths"][0]), delimiter=",")
            except KeyError:
                # CSV rendition missing; emit an empty table.
                df = pd.DataFrame()

            table_extraction = _construct_table_metadata(
                df.to_markdown(),
                pdf_metadata.page_count,
                item["Page"],
                block_idx,
                source_metadata,
                base_unified_metadata,
                bbox=(bounds[0], bounds[1], bounds[2], bounds[3]),
            )

            extracted_data.append(table_extraction)

    # Flush any remaining page-level text accumulated for the final page.
    if text_depth == TextTypeEnum.PAGE:
        text_extraction = construct_text_metadata(
            accumulated_text,
            pdf_metadata.page_count,
            page_idx,
            block_idx,
            text_depth,
            source_metadata,
            base_unified_metadata,
            # bbox=(0, 0, data["pages"][page_idx]["width"], data["pages"][page_idx]["height"]),
        )

        if len(text_extraction) > 0:
            extracted_data.append(text_extraction)

    # Document-level text is emitted once, covering the whole document.
    if extract_text and text_depth == TextTypeEnum.DOCUMENT:
        text_extraction = construct_text_metadata(
            accumulated_text,
            pdf_metadata.page_count,
            -1,
            -1,
            text_depth,
            source_metadata,
            base_unified_metadata,
        )

        if len(text_extraction) > 0:
            extracted_data.append(text_extraction)

    return extracted_data
|
|
392
|
-
|
|
393
|
-
def _construct_image_metadata(
    image,
    image_text,
    page_count,
    page_idx,
    block_idx,
    source_metadata,
    base_unified_metadata,
    page_nearby_blocks,
    bbox,
):
    """
    Build one unified-metadata record for an extracted image.

    Returns a three-element list: the IMAGE content-type value, the validated
    metadata serialized to a plain dict, and a freshly generated UUID string.
    """
    # Positional hierarchy of this image within the document.
    hierarchy = {
        "page_count": page_count,
        "page": page_idx,
        "block": block_idx,
        "line": -1,
        "span": -1,
        "nearby_objects": page_nearby_blocks,
    }

    content_metadata = {
        "type": ContentTypeEnum.IMAGE,
        "description": ContentDescriptionEnum.PDF_IMAGE,
        "page_number": page_idx,
        "hierarchy": hierarchy,
    }

    # Width/height are derived from the (x0, y0, x1, y1) bounding box.
    image_metadata = {
        "image_type": DocumentTypeEnum.PNG,
        "caption": "",
        "text": image_text,
        "image_location": bbox,
        "width": bbox[2] - bbox[0],
        "height": bbox[3] - bbox[1],
    }

    # Overlay the image-specific fields onto a copy of the base metadata so
    # the caller's dict is never mutated.
    merged_metadata = base_unified_metadata.copy()
    merged_metadata.update(
        {
            "content": image,
            "source_metadata": source_metadata,
            "content_metadata": content_metadata,
            "image_metadata": image_metadata,
        }
    )

    validated = validate_metadata(merged_metadata)

    return [ContentTypeEnum.IMAGE.value, validated.model_dump(), str(uuid.uuid4())]
443
|
-
def _construct_table_metadata(
    table,
    page_count,
    page_idx,
    block_idx,
    source_metadata,
    base_unified_metadata,
    bbox,
):
    """
    Build one unified-metadata record for an extracted table.

    Returns a three-element list: the STRUCTURED content-type value, the
    validated metadata serialized to a plain dict, and a freshly generated
    UUID string.
    """
    # Positional hierarchy of this table within the document.
    hierarchy = {
        "page_count": page_count,
        "page": page_idx,
        "block": block_idx,
        "line": -1,
        "span": -1,
    }

    content_metadata = {
        "type": ContentTypeEnum.STRUCTURED,
        "description": ContentDescriptionEnum.PDF_TABLE,
        "page_number": page_idx,
        "hierarchy": hierarchy,
    }

    # The table content itself is markdown text (see caller's df.to_markdown()).
    table_metadata = {
        "caption": "",
        "table_format": TableFormatEnum.MARKDOWN,
        "table_location": bbox,
    }

    # Overlay the table-specific fields onto a copy of the base metadata so
    # the caller's dict is never mutated.
    merged_metadata = base_unified_metadata.copy()
    merged_metadata.update(
        {
            "content": table,
            "source_metadata": source_metadata,
            "content_metadata": content_metadata,
            "table_metadata": table_metadata,
        }
    )

    validated = validate_metadata(merged_metadata)

    return [ContentTypeEnum.STRUCTURED.value, validated.model_dump(), str(uuid.uuid4())]
|