nv-ingest-api 26.1.0rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nv-ingest-api might be problematic.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +218 -0
- nv_ingest_api/interface/extract.py +977 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +200 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +186 -0
- nv_ingest_api/internal/__init__.py +0 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +550 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
- nv_ingest_api/internal/extract/html/__init__.py +3 -0
- nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
- nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
- nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
- nv_ingest_api/internal/meta/__init__.py +3 -0
- nv_ingest_api/internal/meta/udf.py +232 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/internal/primitives/control_message_task.py +16 -0
- nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
- nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
- nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
- nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
- nv_ingest_api/internal/schemas/meta/udf.py +23 -0
- nv_ingest_api/internal/schemas/mixins.py +39 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +251 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +219 -0
- nv_ingest_api/internal/transform/embed_text.py +702 -0
- nv_ingest_api/internal/transform/split_text.py +182 -0
- nv_ingest_api/util/__init__.py +3 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/dataloader/__init__.py +9 -0
- nv_ingest_api/util/dataloader/dataloader.py +409 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +429 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +177 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
- nv_ingest_api/util/image_processing/transforms.py +850 -0
- nv_ingest_api/util/imports/__init__.py +3 -0
- nv_ingest_api/util/imports/callable_signatures.py +108 -0
- nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
- nv_ingest_api/util/introspection/__init__.py +3 -0
- nv_ingest_api/util/introspection/class_inspect.py +145 -0
- nv_ingest_api/util/introspection/function_inspect.py +65 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +102 -0
- nv_ingest_api/util/logging/sanitize.py +84 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +516 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
- nv_ingest_api/util/nim/__init__.py +161 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +428 -0
- nv_ingest_api/util/schema/__init__.py +3 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +86 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- nv_ingest_api/util/string_processing/configuration.py +682 -0
- nv_ingest_api/util/string_processing/yaml.py +109 -0
- nv_ingest_api/util/system/__init__.py +0 -0
- nv_ingest_api/util/system/hardware_info.py +594 -0
- nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
- nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
- nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
- nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
- nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
- udfs/__init__.py +5 -0
- udfs/llm_summarizer_udf.py +259 -0
nv_ingest_api/util/metadata/aggregators.py
@@ -0,0 +1,516 @@
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0


import base64
import io
import uuid
from dataclasses import dataclass
from datetime import datetime
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple

import pandas as pd
import pypdfium2 as pdfium
from PIL import Image
from pypdfium2 import PdfImage

from nv_ingest_api.internal.enums.common import ContentDescriptionEnum, DocumentTypeEnum
from nv_ingest_api.internal.enums.common import ContentTypeEnum
from nv_ingest_api.internal.schemas.meta.metadata_schema import NearbyObjectsSchema
from nv_ingest_api.internal.enums.common import TableFormatEnum
from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadata
from nv_ingest_api.util.converters import datetools
from nv_ingest_api.util.detectors.language import detect_language
from nv_ingest_api.util.exception_handlers.pdf import pdfium_exception_handler


@dataclass
class CroppedImageWithContent:
    content: str
    image: str
    bbox: Tuple[int, int, int, int]
    max_width: int
    max_height: int
    type_string: str
    content_format: str = ""


@dataclass
class LatexTable:
    latex: pd.DataFrame
    bbox: Tuple[int, int, int, int]
    max_width: int
    max_height: int


@dataclass
class Base64Image:
    image: str
    bbox: Tuple[int, int, int, int]
    width: int
    height: int
    max_width: int
    max_height: int


@dataclass
class PDFMetadata:
    """
    A data object to store metadata information extracted from a PDF document.
    """

    page_count: int
    filename: str
    last_modified: str
    date_created: str
    keywords: List[str]
    source_type: str = "PDF"


def extract_pdf_metadata(doc: pdfium.PdfDocument, source_id: str) -> PDFMetadata:
    """
    Extracts metadata and relevant information from a PDF document.

    Parameters
    ----------
    doc : pdfium.PdfDocument
        The opened PDF document.
    source_id : str
        The identifier for the source document, typically the filename.

    Returns
    -------
    PDFMetadata
        An object containing extracted metadata and information including:
        - `page_count`: The total number of pages in the PDF.
        - `filename`: The source filename or identifier.
        - `last_modified`: The last modified date of the PDF document.
        - `date_created`: The creation date of the PDF document.
        - `keywords`: Keywords associated with the PDF document.
        - `source_type`: The type/format of the source, e.g., "PDF".

    Raises
    ------
    PdfiumError
        If there is an issue processing the PDF document.
    """
    page_count: int = len(doc)
    filename: str = source_id

    # Extract document metadata
    doc_meta = doc.get_metadata_dict()

    # Extract and process the last modified date
    last_modified: str = doc_meta.get("ModDate")
    if last_modified in (None, ""):
        last_modified = datetools.remove_tz(datetime.now()).isoformat()
    else:
        last_modified = datetools.datetimefrompdfmeta(last_modified)

    # Extract and process the creation date
    date_created: str = doc_meta.get("CreationDate")
    if date_created in (None, ""):
        date_created = datetools.remove_tz(datetime.now()).isoformat()
    else:
        date_created = datetools.datetimefrompdfmeta(date_created)

    # Extract keywords, defaulting to an empty list if not found
    keywords: List[str] = doc_meta.get("Keywords", [])

    # Create the PDFMetadata object
    metadata = PDFMetadata(
        page_count=page_count,
        filename=filename,
        last_modified=last_modified,
        date_created=date_created,
        keywords=keywords,
    )

    return metadata
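
Editorial usage sketch (not part of the wheel): one way extract_pdf_metadata might be driven with pypdfium2. The file path is a hypothetical placeholder.

import pypdfium2 as pdfium

from nv_ingest_api.util.metadata.aggregators import extract_pdf_metadata

# Hypothetical input path; any readable PDF works.
with open("example.pdf", "rb") as fh:
    doc = pdfium.PdfDocument(fh.read())

try:
    meta = extract_pdf_metadata(doc, source_id="example.pdf")
    print(meta.page_count, meta.last_modified, meta.keywords)
finally:
    doc.close()

Note that a missing ModDate or CreationDate entry falls back to the current timestamp with the timezone stripped.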


def construct_text_metadata(
    accumulated_text,
    keywords,
    page_idx,
    block_idx,
    line_idx,
    span_idx,
    page_count,
    text_depth,
    source_metadata,
    base_unified_metadata,
    delimiter=" ",
    bbox_max_dimensions: Tuple[int, int] = (-1, -1),
    nearby_objects: Optional[Dict[str, Any]] = None,
):
    extracted_text = delimiter.join(accumulated_text)

    content_metadata = {
        "type": ContentTypeEnum.TEXT,
        "description": ContentDescriptionEnum.PDF_TEXT,
        "page_number": page_idx,
        "hierarchy": {
            "page_count": page_count,
            "page": page_idx,
            "block": -1,
            "line": -1,
            "span": -1,
            "nearby_objects": nearby_objects or NearbyObjectsSchema(),
        },
    }

    language = detect_language(extracted_text)

    # TODO(Devin) - Implement bounding box logic for text
    bbox = (-1, -1, -1, -1)

    text_metadata = {
        "text_type": text_depth,
        "summary": "",
        "keywords": keywords,
        "language": language,
        "text_location": bbox,
        "text_location_max_dimensions": bbox_max_dimensions,
    }

    ext_unified_metadata = base_unified_metadata.copy()

    ext_unified_metadata.update(
        {
            "content": extracted_text,
            "source_metadata": source_metadata,
            "content_metadata": content_metadata,
            "text_metadata": text_metadata,
        }
    )

    validated_unified_metadata = validate_metadata(ext_unified_metadata)

    return [ContentTypeEnum.TEXT, validated_unified_metadata.model_dump(), str(uuid.uuid4())]
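
Editorial sketch (not in the package) of the call shape for construct_text_metadata. page_text_to_row is a hypothetical helper, and source_metadata / base_unified_metadata are assumed to have been built upstream so that validate_metadata succeeds.

from typing import Any, Dict, List

from nv_ingest_api.util.metadata.aggregators import construct_text_metadata


def page_text_to_row(
    page_lines: List[str],
    page_idx: int,
    page_count: int,
    source_metadata: Dict[str, Any],
    base_unified_metadata: Dict[str, Any],
):
    # The function hard-codes -1 for block/line/span in the hierarchy it emits,
    # so those indices are pass-through bookkeeping at page-level depth.
    return construct_text_metadata(
        page_lines,
        keywords=[],
        page_idx=page_idx,
        block_idx=-1,
        line_idx=-1,
        span_idx=-1,
        page_count=page_count,
        text_depth="page",
        source_metadata=source_metadata,
        base_unified_metadata=base_unified_metadata,
        delimiter="\n",
    )

The returned triple (content type, validated metadata dict, UUID string) is the same row shape the other constructors in this module produce.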


def construct_image_metadata_from_base64(
    base64_image: str,
    page_idx: int,
    page_count: int,
    source_metadata: Dict[str, Any],
    base_unified_metadata: Dict[str, Any],
    subtype: None | ContentTypeEnum | str = "",
    text: str = "",
) -> List[Any]:
    """
    Extracts image data from a base64-encoded image string, decodes the image to get
    its dimensions and bounding box, and constructs metadata for the image.

    Parameters
    ----------
    base64_image : str
        A base64-encoded string representing the image.
    page_idx : int
        The index of the current page being processed.
    page_count : int
        The total number of pages in the PDF document.
    source_metadata : Dict[str, Any]
        Metadata related to the source of the PDF document.
    base_unified_metadata : Dict[str, Any]
        The base unified metadata structure to be updated with the extracted image information.
    subtype : None | ContentTypeEnum | str, optional
        Optional content subtype recorded in the content metadata.
    text : str, optional
        Optional text associated with the image.

    Returns
    -------
    List[Any]
        A list containing the content type, validated metadata dictionary, and a UUID string.

    Raises
    ------
    ValueError
        If the image cannot be decoded from the base64 string.
    """
    # Decode the base64 image
    try:
        image_data = base64.b64decode(base64_image)
        image = Image.open(io.BytesIO(image_data))
    except Exception as e:
        raise ValueError(f"Failed to decode image from base64: {e}")

    # Extract image dimensions and bounding box
    width, height = image.size
    bbox = (0, 0, width, height)  # Assuming the full image as the bounding box

    # Construct content metadata
    content_metadata: Dict[str, Any] = {
        "type": ContentTypeEnum.IMAGE,
        "description": ContentDescriptionEnum.PDF_IMAGE,
        "page_number": page_idx,
        "hierarchy": {
            "page_count": page_count,
            "page": page_idx,
            "block": -1,
            "line": -1,
            "span": -1,
        },
        "subtype": subtype or "",
    }

    # Construct image metadata
    image_metadata: Dict[str, Any] = {
        "image_type": DocumentTypeEnum.PNG,
        "structured_image_type": ContentTypeEnum.UNKNOWN,
        "caption": "",
        "text": text,
        "image_location": bbox,
        "image_location_max_dimensions": (width, height),
        "height": height,
    }

    # Update the unified metadata with the extracted image information
    unified_metadata: Dict[str, Any] = base_unified_metadata.copy()
    unified_metadata.update(
        {
            "content": base64_image,
            "source_metadata": source_metadata,
            "content_metadata": content_metadata,
            "image_metadata": image_metadata,
        }
    )

    # Validate and return the unified metadata
    validated_unified_metadata = validate_metadata(unified_metadata)
    return [ContentTypeEnum.IMAGE, validated_unified_metadata.model_dump(), str(uuid.uuid4())]
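
Editorial sketch: encoding a PIL image to base64 and handing it to construct_image_metadata_from_base64. image_to_row is hypothetical, and the two metadata dicts are assumed to come from upstream stages.

import base64
import io
from typing import Any, Dict

from PIL import Image

from nv_ingest_api.util.metadata.aggregators import construct_image_metadata_from_base64


def image_to_row(
    img: Image.Image,
    page_idx: int,
    page_count: int,
    source_metadata: Dict[str, Any],
    base_unified_metadata: Dict[str, Any],
):
    # Serialize the image to PNG and base64-encode it, matching the PNG image_type
    # that the constructor records.
    buffer = io.BytesIO()
    img.save(buffer, format="PNG")
    b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
    return construct_image_metadata_from_base64(
        b64,
        page_idx=page_idx,
        page_count=page_count,
        source_metadata=source_metadata,
        base_unified_metadata=base_unified_metadata,
    )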


def construct_image_metadata_from_pdf_image(
    pdf_image: PdfImage,
    page_idx: int,
    page_count: int,
    source_metadata: Dict[str, Any],
    base_unified_metadata: Dict[str, Any],
) -> List[Any]:
    """
    Extracts image data from a PdfImage object, converts it to a base64-encoded string,
    and constructs metadata for the image.

    Parameters
    ----------
    pdf_image : PdfImage
        The PdfImage object from which the image will be extracted.
    page_idx : int
        The index of the current page being processed.
    page_count : int
        The total number of pages in the PDF document.
    source_metadata : dict
        Metadata related to the source of the PDF document.
    base_unified_metadata : dict
        The base unified metadata structure to be updated with the extracted image information.

    Returns
    -------
    List[Any]
        A list containing the content type, validated metadata dictionary, and a UUID string.

    Raises
    ------
    PdfiumError
        If the image cannot be extracted due to an issue with the PdfImage object.
    """

    # Construct content metadata
    content_metadata: Dict[str, Any] = {
        "type": ContentTypeEnum.IMAGE,
        "description": ContentDescriptionEnum.PDF_IMAGE,
        "page_number": page_idx,
        "hierarchy": {
            "page_count": page_count,
            "page": page_idx,
            "block": -1,
            "line": -1,
            "span": -1,
        },
    }

    # Construct image metadata
    image_metadata: Dict[str, Any] = {
        "image_type": DocumentTypeEnum.PNG,
        "structured_image_type": ContentTypeEnum.UNKNOWN,
        "caption": "",
        "text": "",
        "image_location": pdf_image.bbox,
        "image_location_max_dimensions": (max(pdf_image.max_width, 0), max(pdf_image.max_height, 0)),
        "height": pdf_image.height,
        "width": pdf_image.width,
    }

    # Update the unified metadata with the extracted image information
    unified_metadata: Dict[str, Any] = base_unified_metadata.copy()
    unified_metadata.update(
        {
            "content": pdf_image.image,
            "source_metadata": source_metadata,
            "content_metadata": content_metadata,
            "image_metadata": image_metadata,
        }
    )

    # Validate and return the unified metadata
    validated_unified_metadata = validate_metadata(unified_metadata)
    return [ContentTypeEnum.IMAGE, validated_unified_metadata.model_dump(), str(uuid.uuid4())]
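
Editorial sketch: the first argument is annotated as pypdfium2's PdfImage, but the attributes the function actually reads (image, bbox, width, height, max_width, max_height) match the Base64Image container defined above, so this sketch assumes an object of that shape. embedded_image_to_row and all values are hypothetical.

from typing import Any, Dict, Tuple

from nv_ingest_api.util.metadata.aggregators import (
    Base64Image,
    construct_image_metadata_from_pdf_image,
)


def embedded_image_to_row(
    b64_png: str,
    bbox: Tuple[int, int, int, int],
    page_size: Tuple[int, int],
    page_idx: int,
    page_count: int,
    source_metadata: Dict[str, Any],
    base_unified_metadata: Dict[str, Any],
):
    # Wrap a previously extracted, base64-encoded embedded image in the container
    # whose fields this helper reads.
    left, top, right, bottom = bbox
    image = Base64Image(
        image=b64_png,
        bbox=bbox,
        width=int(right - left),
        height=int(bottom - top),
        max_width=page_size[0],
        max_height=page_size[1],
    )
    return construct_image_metadata_from_pdf_image(
        image, page_idx, page_count, source_metadata, base_unified_metadata
    )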


def _construct_text_image_primitive(
    cropped_image: CroppedImageWithContent,
    page_idx: int,
    page_count: int,
    source_metadata: Dict,
    base_unified_metadata: Dict,
) -> List[Any]:
    """Constructs an 'image' primitive for a detected text block, intended for downstream OCR."""
    content_metadata = {
        "type": ContentTypeEnum.TEXT,
        "description": ContentDescriptionEnum.PDF_TEXT,
        "page_number": page_idx,
        "hierarchy": {
            "page_count": page_count,
            "page": page_idx,
        },
        "subtype": cropped_image.type_string,
    }

    text_metadata = {
        "text_type": "page",
        "text_location": cropped_image.bbox,
        "text_location_max_dimensions": (cropped_image.max_width, cropped_image.max_height),
    }

    unified_metadata = base_unified_metadata.copy()
    unified_metadata.update(
        {
            "content": cropped_image.image,  # The base64 image of the text block
            "source_metadata": source_metadata,
            "content_metadata": content_metadata,
            "text_metadata": text_metadata,
        }
    )

    validated_metadata = validate_metadata(unified_metadata)
    return [ContentTypeEnum.TEXT, validated_metadata.model_dump(), str(uuid.uuid4())]


# TODO(Devin): Disambiguate tables and charts, create two distinct processing methods
@pdfium_exception_handler(descriptor="pdfium")
def construct_page_element_metadata(
    structured_image: CroppedImageWithContent,
    page_idx: int,
    page_count: int,
    source_metadata: Dict,
    base_unified_metadata: Dict,
):
    """
    +--------------------------------+--------------------------+------------+---+
    | Table/Chart Metadata           |                          | Extracted  | Y |
    | (tables within documents)      |                          |            |   |
    +--------------------------------+--------------------------+------------+---+
    | Table format                   | Structured (dataframe /  | Extracted  |   |
    |                                | lists of rows and        |            |   |
    |                                | columns), or serialized  |            |   |
    |                                | as markdown, html,       |            |   |
    |                                | latex, simple (cells     |            |   |
    |                                | separated just as spaces)|            |   |
    +--------------------------------+--------------------------+------------+---+
    | Table content                  | Extracted text content   |            |   |
    |                                |                          |            |   |
    |                                | Important: Tables should |            |   |
    |                                | not be chunked           |            |   |
    +--------------------------------+--------------------------+------------+---+
    | Table location                 | Bounding box of the table|            |   |
    +--------------------------------+--------------------------+------------+---+
    | Caption                        | Detected captions for    |            |   |
    |                                | the table/chart          |            |   |
    +--------------------------------+--------------------------+------------+---+
    | uploaded_image_uri             | Mirrors                  |            |   |
    |                                | source_metadata.         |            |   |
    |                                | source_location          |            |   |
    +--------------------------------+--------------------------+------------+---+
    """
    text_types = {"paragraph", "title", "header_footer"}
    if structured_image.type_string in text_types:
        return _construct_text_image_primitive(
            structured_image, page_idx, page_count, source_metadata, base_unified_metadata
        )

    if structured_image.type_string in ("table",):
        content = structured_image.image
        structured_content_text = structured_image.content
        structured_content_format = structured_image.content_format
        table_format = TableFormatEnum.IMAGE
        subtype = ContentTypeEnum.TABLE
        description = ContentDescriptionEnum.PDF_TABLE
        meta_name = "table_metadata"

    elif structured_image.type_string in ("chart",):
        content = structured_image.image
        structured_content_text = structured_image.content
        structured_content_format = structured_image.content_format
        table_format = TableFormatEnum.IMAGE
        subtype = ContentTypeEnum.CHART
        description = ContentDescriptionEnum.PDF_CHART
        # TODO(Devin) swap this to chart_metadata after we confirm metadata schema changes.
        meta_name = "table_metadata"

    elif structured_image.type_string in ("infographic",):
        content = structured_image.image
        structured_content_text = structured_image.content
        structured_content_format = structured_image.content_format
        table_format = TableFormatEnum.IMAGE
        subtype = ContentTypeEnum.INFOGRAPHIC
        description = ContentDescriptionEnum.PDF_INFOGRAPHIC
        meta_name = "table_metadata"

    else:
        raise ValueError(f"Unknown table/chart/infographic type: {structured_image.type_string}")

    content_metadata = {
        "type": ContentTypeEnum.STRUCTURED,
        "description": description,
        "page_number": page_idx,
        "hierarchy": {
            "page_count": page_count,
            "page": page_idx,
            "line": -1,
            "span": -1,
        },
        "subtype": subtype,
    }

    structured_metadata = {
        "caption": "",
        "table_format": table_format,
        "table_content": structured_content_text,
        "table_content_format": structured_content_format,
        "table_location": structured_image.bbox,
        "table_location_max_dimensions": (structured_image.max_width, structured_image.max_height),
    }

    ext_unified_metadata = base_unified_metadata.copy()

    ext_unified_metadata.update(
        {
            "content": content,
            "source_metadata": source_metadata,
            "content_metadata": content_metadata,
            meta_name: structured_metadata,
        }
    )

    validated_unified_metadata = validate_metadata(ext_unified_metadata)

    return [ContentTypeEnum.STRUCTURED, validated_unified_metadata.model_dump(), str(uuid.uuid4())]


# TODO: remove this alias
construct_table_and_chart_metadata = construct_page_element_metadata
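
Editorial sketch: wrapping a detected table crop in CroppedImageWithContent and routing it through construct_page_element_metadata. table_crop_to_row and all values are hypothetical; "markdown" as a content format is taken from the docstring table above, and the metadata dicts are assumed to be built upstream.

from typing import Any, Dict, Tuple

from nv_ingest_api.util.metadata.aggregators import (
    CroppedImageWithContent,
    construct_page_element_metadata,
)


def table_crop_to_row(
    b64_crop: str,
    table_text: str,
    bbox: Tuple[int, int, int, int],
    page_size: Tuple[int, int],
    page_idx: int,
    page_count: int,
    source_metadata: Dict[str, Any],
    base_unified_metadata: Dict[str, Any],
):
    # "table" takes the structured branch; "chart" and "infographic" behave the same way,
    # while "paragraph", "title", and "header_footer" produce a TEXT primitive instead.
    element = CroppedImageWithContent(
        content=table_text,
        image=b64_crop,
        bbox=bbox,
        max_width=page_size[0],
        max_height=page_size[1],
        type_string="table",
        content_format="markdown",
    )
    return construct_page_element_metadata(
        element, page_idx, page_count, source_metadata, base_unified_metadata
    )

Note that chart and infographic rows are still written under the table_metadata key, per the TODO in the function body.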
|