nv-ingest-api 2025.4.16.dev20250416__py3-none-any.whl → 2025.4.17.dev20250417__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +215 -0
- nv_ingest_api/interface/extract.py +972 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +218 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +200 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +494 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
- nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
- nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
- nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +232 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +205 -0
- nv_ingest_api/internal/transform/embed_text.py +496 -0
- nv_ingest_api/internal/transform/split_text.py +157 -0
- nv_ingest_api/util/__init__.py +0 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +223 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +179 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
- nv_ingest_api/util/image_processing/transforms.py +407 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +31 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +435 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +469 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
- nv_ingest_api/util/nim/__init__.py +56 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +427 -0
- nv_ingest_api/util/schema/__init__.py +0 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +72 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +334 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +398 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- {nv_ingest_api-2025.4.16.dev20250416.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/METADATA +1 -1
- nv_ingest_api-2025.4.17.dev20250417.dist-info/RECORD +152 -0
- nv_ingest_api-2025.4.16.dev20250416.dist-info/RECORD +0 -9
- /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
- {nv_ingest_api-2025.4.16.dev20250416.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.4.16.dev20250416.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.4.16.dev20250416.dist-info → nv_ingest_api-2025.4.17.dev20250417.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,469 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import base64
|
|
7
|
+
import io
|
|
8
|
+
import uuid
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from datetime import datetime
|
|
11
|
+
from typing import Any
|
|
12
|
+
from typing import Dict
|
|
13
|
+
from typing import List
|
|
14
|
+
from typing import Optional
|
|
15
|
+
from typing import Tuple
|
|
16
|
+
|
|
17
|
+
import pandas as pd
|
|
18
|
+
import pypdfium2 as pdfium
|
|
19
|
+
from PIL import Image
|
|
20
|
+
from pypdfium2 import PdfImage
|
|
21
|
+
|
|
22
|
+
from nv_ingest_api.internal.enums.common import ContentDescriptionEnum, DocumentTypeEnum
|
|
23
|
+
from nv_ingest_api.internal.enums.common import ContentTypeEnum
|
|
24
|
+
from nv_ingest_api.internal.schemas.meta.metadata_schema import NearbyObjectsSchema
|
|
25
|
+
from nv_ingest_api.internal.enums.common import TableFormatEnum
|
|
26
|
+
from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadata
|
|
27
|
+
from nv_ingest_api.util.converters import datetools
|
|
28
|
+
from nv_ingest_api.util.detectors.language import detect_language
|
|
29
|
+
from nv_ingest_api.util.exception_handlers.pdf import pdfium_exception_handler
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class CroppedImageWithContent:
    """
    A cropped region of a page image together with content extracted from it.

    Used to carry structured page elements (tables, charts, infographics)
    between extraction stages; consumed by ``construct_page_element_metadata``.
    """

    # Extracted textual content for the region; serialized per `content_format`.
    content: str
    # Image payload for the cropped region (stored as the metadata "content"
    # downstream; presumably base64-encoded — confirm against producers).
    image: str
    # Bounding box of the region; presumably (x0, y0, x1, y1) in page
    # coordinates — TODO confirm.
    bbox: Tuple[int, int, int, int]
    # Dimensions of the canvas the bbox coordinates are relative to.
    max_width: int
    max_height: int
    # Element kind: "table", "chart", or "infographic" (see
    # construct_page_element_metadata).
    type_string: str
    # Serialization format of `content`; empty string when not set.
    content_format: str = ""
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class LatexTable:
    """
    A table extracted from a document, with its location on the source page.
    """

    # Table content. NOTE(review): annotated as pd.DataFrame although the
    # field name suggests LaTeX markup — confirm against producers.
    latex: pd.DataFrame
    # Bounding box of the table; presumably (x0, y0, x1, y1) — TODO confirm.
    bbox: Tuple[int, int, int, int]
    # Dimensions of the canvas the bbox coordinates are relative to.
    max_width: int
    max_height: int
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class Base64Image:
    """
    A base64-encoded image extracted from a document, with size and location.
    """

    # Base64-encoded image payload.
    image: str
    # Bounding box of the image; presumably (x0, y0, x1, y1) — TODO confirm.
    bbox: Tuple[int, int, int, int]
    # Dimensions of the image itself.
    width: int
    height: int
    # Dimensions of the canvas the bbox coordinates are relative to.
    max_width: int
    max_height: int
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass
class PDFMetadata:
    """
    A data object to store metadata information extracted from a PDF document.

    Populated by ``extract_pdf_metadata``.
    """

    # Total number of pages in the document.
    page_count: int
    # Source identifier, typically the original filename.
    filename: str
    # Last-modified timestamp (ISO-8601 when derived from the fallback path;
    # otherwise whatever datetools.datetimefrompdfmeta returns).
    last_modified: str
    # Creation timestamp (same format considerations as `last_modified`).
    date_created: str
    # Keywords taken from the document's metadata dictionary.
    keywords: List[str]
    # Source type/format label; defaults to "PDF".
    source_type: str = "PDF"
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _pdf_meta_date_or_now(raw_date: Optional[str]) -> str:
    """
    Normalize a PDF metadata date value.

    Converts a pdfium metadata date string via ``datetools.datetimefrompdfmeta``;
    a missing or empty value falls back to the current local time (timezone
    stripped) in ISO-8601 form.
    """
    if raw_date in (None, ""):
        return datetools.remove_tz(datetime.now()).isoformat()
    return datetools.datetimefrompdfmeta(raw_date)


def extract_pdf_metadata(doc: pdfium.PdfDocument, source_id: str) -> PDFMetadata:
    """
    Extracts metadata and relevant information from a PDF document.

    Parameters
    ----------
    doc : pdfium.PdfDocument
        The opened PDF document to read metadata from.
        (Previous docstring incorrectly documented a ``pdf_stream: bytes``
        parameter that does not exist.)
    source_id : str
        The identifier for the source document, typically the filename.

    Returns
    -------
    PDFMetadata
        An object containing extracted metadata and information including:
        - `page_count`: The total number of pages in the PDF.
        - `filename`: The source filename or identifier.
        - `last_modified`: The last modified date of the PDF document.
        - `date_created`: The creation date of the PDF document.
        - `keywords`: Keywords associated with the PDF document.
        - `source_type`: The type/format of the source, e.g., "PDF".

    Raises
    ------
    PdfiumError
        If there is an issue processing the PDF document.
    """
    page_count: int = len(doc)
    filename: str = source_id

    # Raw metadata dictionary as reported by pdfium.
    doc_meta = doc.get_metadata_dict()

    # Normalize modification/creation dates, defaulting to "now" when absent.
    last_modified: str = _pdf_meta_date_or_now(doc_meta.get("ModDate"))
    date_created: str = _pdf_meta_date_or_now(doc_meta.get("CreationDate"))

    # Extract keywords, defaulting to an empty list if not found.
    keywords: List[str] = doc_meta.get("Keywords", [])

    return PDFMetadata(
        page_count=page_count,
        filename=filename,
        last_modified=last_modified,
        date_created=date_created,
        keywords=keywords,
    )
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def construct_text_metadata(
    accumulated_text,
    keywords,
    page_idx,
    block_idx,
    line_idx,
    span_idx,
    page_count,
    text_depth,
    source_metadata,
    base_unified_metadata,
    delimiter=" ",
    bbox_max_dimensions: Tuple[int, int] = (-1, -1),
    nearby_objects: Optional[Dict[str, Any]] = None,
):
    """
    Build a unified-metadata text entry for a run of extracted text.

    Joins ``accumulated_text`` with ``delimiter``, detects its language, and
    merges content/text metadata into a copy of ``base_unified_metadata``.

    Returns
    -------
    list
        ``[ContentTypeEnum.TEXT, validated_metadata_dict, uuid_string]``.
    """
    joined_text = delimiter.join(accumulated_text)

    # NOTE(review): block_idx / line_idx / span_idx are accepted for API
    # symmetry but the hierarchy currently pins block/line/span to -1.
    hierarchy = {
        "page_count": page_count,
        "page": page_idx,
        "block": -1,
        "line": -1,
        "span": -1,
        "nearby_objects": nearby_objects or NearbyObjectsSchema(),
    }
    content_metadata = {
        "type": ContentTypeEnum.TEXT,
        "description": ContentDescriptionEnum.PDF_TEXT,
        "page_number": page_idx,
        "hierarchy": hierarchy,
    }

    # TODO(Devin) - Implement bounding box logic for text
    placeholder_bbox = (-1, -1, -1, -1)

    text_metadata = {
        "text_type": text_depth,
        "summary": "",
        "keywords": keywords,
        "language": detect_language(joined_text),
        "text_location": placeholder_bbox,
        "text_location_max_dimensions": bbox_max_dimensions,
    }

    # Merge into a fresh copy so the caller's base metadata is not mutated.
    merged_metadata = {
        **base_unified_metadata,
        "content": joined_text,
        "source_metadata": source_metadata,
        "content_metadata": content_metadata,
        "text_metadata": text_metadata,
    }

    validated = validate_metadata(merged_metadata)

    return [ContentTypeEnum.TEXT, validated.model_dump(), str(uuid.uuid4())]
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def construct_image_metadata_from_base64(
    base64_image: str,
    page_idx: int,
    page_count: int,
    source_metadata: Dict[str, Any],
    base_unified_metadata: Dict[str, Any],
) -> List[Any]:
    """
    Extracts image data from a base64-encoded image string, decodes the image to get
    its dimensions and bounding box, and constructs metadata for the image.

    Parameters
    ----------
    base64_image : str
        A base64-encoded string representing the image.
    page_idx : int
        The index of the current page being processed.
    page_count : int
        The total number of pages in the PDF document.
    source_metadata : Dict[str, Any]
        Metadata related to the source of the PDF document.
    base_unified_metadata : Dict[str, Any]
        The base unified metadata structure to be updated with the extracted image information.

    Returns
    -------
    List[Any]
        A list containing the content type, validated metadata dictionary, and a UUID string.

    Raises
    ------
    ValueError
        If the image cannot be decoded from the base64 string.
    """
    # Decode the payload; surface any failure as a ValueError with the
    # original exception chained for debugging.
    try:
        image_data = base64.b64decode(base64_image)
        image = Image.open(io.BytesIO(image_data))
    except Exception as e:
        raise ValueError(f"Failed to decode image from base64: {e}") from e

    # The bounding box covers the full decoded image.
    width, height = image.size
    bbox = (0, 0, width, height)

    # Construct content metadata.
    content_metadata: Dict[str, Any] = {
        "type": ContentTypeEnum.IMAGE,
        "description": ContentDescriptionEnum.PDF_IMAGE,
        "page_number": page_idx,
        "hierarchy": {
            "page_count": page_count,
            "page": page_idx,
            "block": -1,
            "line": -1,
            "span": -1,
        },
    }

    # Construct image metadata.
    image_metadata: Dict[str, Any] = {
        "image_type": DocumentTypeEnum.PNG,
        "structured_image_type": ContentTypeEnum.UNKNOWN,
        "caption": "",
        "text": "",
        "image_location": bbox,
        "image_location_max_dimensions": (width, height),
        "height": height,
        # Fix: record width alongside height, mirroring
        # construct_image_metadata_from_pdf_image (it was previously omitted).
        "width": width,
    }

    # Update a copy of the unified metadata with the extracted image info.
    unified_metadata: Dict[str, Any] = base_unified_metadata.copy()
    unified_metadata.update(
        {
            "content": base64_image,
            "source_metadata": source_metadata,
            "content_metadata": content_metadata,
            "image_metadata": image_metadata,
        }
    )

    # Validate and return the unified metadata.
    validated_unified_metadata = validate_metadata(unified_metadata)
    return [ContentTypeEnum.IMAGE, validated_unified_metadata.model_dump(), str(uuid.uuid4())]
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def construct_image_metadata_from_pdf_image(
    pdf_image: PdfImage,
    page_idx: int,
    page_count: int,
    source_metadata: Dict[str, Any],
    base_unified_metadata: Dict[str, Any],
) -> List[Any]:
    """
    Constructs unified metadata for an image previously extracted from a PDF.

    The image payload (``pdf_image.image``, presumably already encoded — TODO
    confirm) is stored as the metadata "content" along with location and
    dimension information from the ``PdfImage`` object.

    Parameters
    ----------
    pdf_image : PdfImage
        The image object whose payload, bbox, and dimensions are recorded.
    page_idx : int
        The index of the current page being processed.
    page_count : int
        The total number of pages in the PDF document.
    source_metadata : dict
        Metadata related to the source of the PDF document.
    base_unified_metadata : dict
        The base unified metadata structure to be updated with the extracted image information.

    Returns
    -------
    List[Any]
        A list containing the content type, validated metadata dictionary, and a UUID string.

    Raises
    ------
    PdfiumError
        If the image cannot be extracted due to an issue with the PdfImage object.
    """

    # Construct content metadata
    content_metadata: Dict[str, Any] = {
        "type": ContentTypeEnum.IMAGE,
        "description": ContentDescriptionEnum.PDF_IMAGE,
        "page_number": page_idx,
        "hierarchy": {
            "page_count": page_count,
            "page": page_idx,
            "block": -1,
            "line": -1,
            "span": -1,
        },
    }

    # Construct image metadata; max dimensions are clamped to be non-negative.
    image_metadata: Dict[str, Any] = {
        "image_type": DocumentTypeEnum.PNG,
        "structured_image_type": ContentTypeEnum.UNKNOWN,
        "caption": "",
        "text": "",
        "image_location": pdf_image.bbox,
        "image_location_max_dimensions": (max(pdf_image.max_width, 0), max(pdf_image.max_height, 0)),
        "height": pdf_image.height,
        "width": pdf_image.width,
    }

    # Update the unified metadata with the extracted image information
    unified_metadata: Dict[str, Any] = base_unified_metadata.copy()
    unified_metadata.update(
        {
            "content": pdf_image.image,
            "source_metadata": source_metadata,
            "content_metadata": content_metadata,
            "image_metadata": image_metadata,
        }
    )

    # Validate and return the unified metadata
    validated_unified_metadata = validate_metadata(unified_metadata)
    return [ContentTypeEnum.IMAGE, validated_unified_metadata.model_dump(), str(uuid.uuid4())]
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
# TODO(Devin): Disambiguate tables and charts, create two distinct processing methods
@pdfium_exception_handler(descriptor="pdfium")
def construct_page_element_metadata(
    structured_image: CroppedImageWithContent,
    page_idx: int,
    page_count: int,
    source_metadata: Dict,
    base_unified_metadata: Dict,
):
    """
    Construct a unified-metadata entry for a structured page element
    (table, chart, or infographic).

    +--------------------------------+--------------------------+------------+---+
    | Table/Chart Metadata | | Extracted | Y |
    | (tables within documents) | | | |
    +--------------------------------+--------------------------+------------+---+
    | Table format | Structured (dataframe / | Extracted | |
    | | lists of rows and | | |
    | | columns), or serialized | | |
    | | as markdown, html, | | |
    | | latex, simple (cells | | |
    | | separated just as spaces)| | |
    +--------------------------------+--------------------------+------------+---+
    | Table content | Extracted text content | | |
    | | | | |
    | | Important: Tables should | | |
    | | not be chunked | | |
    +--------------------------------+--------------------------+------------+---+
    | Table location | Bounding box of the table| | |
    +--------------------------------+--------------------------+------------+---+
    | Caption | Detected captions for | | |
    | | the table/chart | | |
    +--------------------------------+--------------------------+------------+---+
    | uploaded_image_uri | Mirrors | | |
    | | source_metadata. | | |
    | | source_location | | |
    +--------------------------------+--------------------------+------------+---+
    """
    # Map element type to its (subtype, description) pair.  The three former
    # branches were identical apart from these two values: all kinds share
    # TableFormatEnum.IMAGE and the "table_metadata" slot.
    # TODO(Devin): swap chart to chart_metadata after we confirm metadata
    # schema changes.
    type_map = {
        "table": (ContentTypeEnum.TABLE, ContentDescriptionEnum.PDF_TABLE),
        "chart": (ContentTypeEnum.CHART, ContentDescriptionEnum.PDF_CHART),
        "infographic": (ContentTypeEnum.INFOGRAPHIC, ContentDescriptionEnum.PDF_INFOGRAPHIC),
    }
    if structured_image.type_string not in type_map:
        raise ValueError(f"Unknown table/chart/infographic type: {structured_image.type_string}")
    subtype, description = type_map[structured_image.type_string]

    content = structured_image.image
    structured_content_text = structured_image.content
    structured_content_format = structured_image.content_format
    table_format = TableFormatEnum.IMAGE
    meta_name = "table_metadata"

    content_metadata = {
        "type": ContentTypeEnum.STRUCTURED,
        "description": description,
        "page_number": page_idx,
        "hierarchy": {
            "page_count": page_count,
            "page": page_idx,
            # NOTE(review): unlike the text/image constructors, no "block"
            # entry is set here — confirm whether that is intentional.
            "line": -1,
            "span": -1,
        },
        "subtype": subtype,
    }

    structured_metadata = {
        "caption": "",
        "table_format": table_format,
        "table_content": structured_content_text,
        "table_content_format": structured_content_format,
        "table_location": structured_image.bbox,
        "table_location_max_dimensions": (structured_image.max_width, structured_image.max_height),
    }

    # Merge into a copy so the caller's base metadata is not mutated.
    ext_unified_metadata = base_unified_metadata.copy()

    ext_unified_metadata.update(
        {
            "content": content,
            "source_metadata": source_metadata,
            "content_metadata": content_metadata,
            meta_name: structured_metadata,
        }
    )

    validated_unified_metadata = validate_metadata(ext_unified_metadata)

    return [ContentTypeEnum.STRUCTURED, validated_unified_metadata.model_dump(), str(uuid.uuid4())]
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
# Backward-compatibility alias for the previous public name.
# TODO: remove this alias once all callers use construct_page_element_metadata.
construct_table_and_chart_metadata = construct_page_element_metadata
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
import math
|
|
8
|
+
import multiprocessing as mp
|
|
9
|
+
import os
|
|
10
|
+
from threading import Lock
|
|
11
|
+
from typing import Any, Callable, Optional
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class SimpleFuture:
|
|
17
|
+
"""
|
|
18
|
+
A simplified future object that uses a multiprocessing Pipe to receive its result.
|
|
19
|
+
|
|
20
|
+
When the result() method is called, it blocks until the worker sends a tuple
|
|
21
|
+
(result, error) over the pipe.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
def __init__(self, parent_conn: mp.connection.Connection) -> None:
|
|
25
|
+
"""
|
|
26
|
+
Parameters
|
|
27
|
+
----------
|
|
28
|
+
parent_conn : mp.connection.Connection
|
|
29
|
+
The parent end of the multiprocessing Pipe used to receive the result.
|
|
30
|
+
"""
|
|
31
|
+
self._parent_conn: mp.connection.Connection = parent_conn
|
|
32
|
+
|
|
33
|
+
def result(self) -> Any:
|
|
34
|
+
"""
|
|
35
|
+
Retrieve the result from the future, blocking until it is available.
|
|
36
|
+
|
|
37
|
+
Returns
|
|
38
|
+
-------
|
|
39
|
+
Any
|
|
40
|
+
The result returned by the worker function.
|
|
41
|
+
|
|
42
|
+
Raises
|
|
43
|
+
------
|
|
44
|
+
Exception
|
|
45
|
+
If the worker function raised an exception, it is re-raised here.
|
|
46
|
+
"""
|
|
47
|
+
result, error = self._parent_conn.recv()
|
|
48
|
+
if error is not None:
|
|
49
|
+
raise error
|
|
50
|
+
return result
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class ProcessWorkerPoolSingleton:
|
|
54
|
+
"""
|
|
55
|
+
A singleton process worker pool using a dual-queue implementation.
|
|
56
|
+
|
|
57
|
+
Instead of a global result queue, each submitted task gets its own Pipe.
|
|
58
|
+
The submit_task() method returns a SimpleFuture, whose result() call blocks
|
|
59
|
+
until the task completes.
|
|
60
|
+
"""
|
|
61
|
+
|
|
62
|
+
_instance: Optional["ProcessWorkerPoolSingleton"] = None
|
|
63
|
+
_lock: Lock = Lock()
|
|
64
|
+
_total_workers: int = 0
|
|
65
|
+
|
|
66
|
+
def __new__(cls) -> "ProcessWorkerPoolSingleton":
    """
    Create or return the singleton instance of ProcessWorkerPoolSingleton.

    Worker count defaults to 40% of the CPUs visible to this process
    (at least one), optionally capped by the MAX_INGEST_PROCESS_WORKERS
    environment variable.

    Returns
    -------
    ProcessWorkerPoolSingleton
        The singleton instance.
    """
    logger.debug("Creating ProcessWorkerPoolSingleton instance...")
    with cls._lock:
        if cls._instance is None:
            max_worker_limit: int = int(os.environ.get("MAX_INGEST_PROCESS_WORKERS", -1))
            instance = super().__new__(cls)
            # Prefer the CPU-affinity count when available (respects cgroup /
            # taskset restrictions).
            available: Optional[int] = (
                len(os.sched_getaffinity(0)) if hasattr(os, "sched_getaffinity") else os.cpu_count()
            )
            # Fix: os.cpu_count() may return None, which previously made
            # `available * 0.4` raise a TypeError; fall back to a single CPU.
            if not available:
                available = 1
            # Use 40% of available CPUs, ensuring at least one worker
            max_workers: int = math.floor(max(1, available * 0.4))
            if (max_worker_limit > 0) and (max_workers > max_worker_limit):
                max_workers = max_worker_limit
            logger.debug("Creating ProcessWorkerPoolSingleton instance with max workers: %d", max_workers)
            instance._initialize(max_workers)
            logger.debug("ProcessWorkerPoolSingleton instance created: %s", instance)
            cls._instance = instance
        else:
            logger.debug("ProcessWorkerPoolSingleton instance already exists: %s", cls._instance)
    return cls._instance
|
|
95
|
+
|
|
96
|
+
def _initialize(self, total_max_workers: int) -> None:
|
|
97
|
+
"""
|
|
98
|
+
Initialize the worker pool with the specified number of worker processes.
|
|
99
|
+
|
|
100
|
+
Parameters
|
|
101
|
+
----------
|
|
102
|
+
total_max_workers : int
|
|
103
|
+
The total number of worker processes to start.
|
|
104
|
+
"""
|
|
105
|
+
self._total_workers = total_max_workers
|
|
106
|
+
self._context: mp.context.ForkContext = mp.get_context("fork")
|
|
107
|
+
# Bounded task queue: maximum tasks queued = 2 * total_max_workers.
|
|
108
|
+
self._task_queue: mp.Queue = self._context.Queue(maxsize=2 * total_max_workers)
|
|
109
|
+
self._next_task_id: int = 0
|
|
110
|
+
self._processes: list[mp.Process] = []
|
|
111
|
+
logger.debug(
|
|
112
|
+
"Initializing ProcessWorkerPoolSingleton with %d workers and queue size %d.",
|
|
113
|
+
total_max_workers,
|
|
114
|
+
2 * total_max_workers,
|
|
115
|
+
)
|
|
116
|
+
for i in range(total_max_workers):
|
|
117
|
+
p: mp.Process = self._context.Process(target=self._worker, args=(self._task_queue,))
|
|
118
|
+
p.start()
|
|
119
|
+
self._processes.append(p)
|
|
120
|
+
logger.debug("Started worker process %d/%d: PID %d", i + 1, total_max_workers, p.pid)
|
|
121
|
+
logger.debug("Initialized with max workers: %d", total_max_workers)
|
|
122
|
+
|
|
123
|
+
@staticmethod
|
|
124
|
+
def _worker(task_queue: mp.Queue) -> None:
|
|
125
|
+
"""
|
|
126
|
+
Worker process that continuously processes tasks from the task queue.
|
|
127
|
+
|
|
128
|
+
Parameters
|
|
129
|
+
----------
|
|
130
|
+
task_queue : mp.Queue
|
|
131
|
+
The queue from which tasks are retrieved.
|
|
132
|
+
"""
|
|
133
|
+
logger.debug("Worker process started: PID %d", os.getpid())
|
|
134
|
+
while True:
|
|
135
|
+
task = task_queue.get()
|
|
136
|
+
if task is None:
|
|
137
|
+
# Stop signal received; exit the loop.
|
|
138
|
+
logger.debug("Worker process %d received stop signal.", os.getpid())
|
|
139
|
+
break
|
|
140
|
+
# Unpack task: (task_id, process_fn, args, child_conn)
|
|
141
|
+
task_id, process_fn, args, child_conn = task
|
|
142
|
+
try:
|
|
143
|
+
result = process_fn(*args)
|
|
144
|
+
child_conn.send((result, None))
|
|
145
|
+
except Exception as e:
|
|
146
|
+
logger.error("Task %d error in worker %d: %s", task_id, os.getpid(), e)
|
|
147
|
+
child_conn.send((None, e))
|
|
148
|
+
finally:
|
|
149
|
+
child_conn.close()
|
|
150
|
+
|
|
151
|
+
def submit_task(self, process_fn: Callable, *args: Any) -> SimpleFuture:
|
|
152
|
+
"""
|
|
153
|
+
Submits a task to the worker pool for asynchronous execution.
|
|
154
|
+
|
|
155
|
+
If a single tuple is passed as the only argument, it is unpacked.
|
|
156
|
+
|
|
157
|
+
Parameters
|
|
158
|
+
----------
|
|
159
|
+
process_fn : Callable
|
|
160
|
+
The function to be executed asynchronously.
|
|
161
|
+
*args : Any
|
|
162
|
+
The arguments to pass to the process function. If a single argument is a tuple,
|
|
163
|
+
it will be unpacked as the function arguments.
|
|
164
|
+
|
|
165
|
+
Returns
|
|
166
|
+
-------
|
|
167
|
+
SimpleFuture
|
|
168
|
+
A future object that can be used to retrieve the result of the task.
|
|
169
|
+
"""
|
|
170
|
+
# Unpack tuple if a single tuple argument is provided.
|
|
171
|
+
if len(args) == 1 and isinstance(args[0], tuple):
|
|
172
|
+
args = args[0]
|
|
173
|
+
parent_conn, child_conn = mp.Pipe(duplex=False)
|
|
174
|
+
task_id: int = self._next_task_id
|
|
175
|
+
self._next_task_id += 1
|
|
176
|
+
self._task_queue.put((task_id, process_fn, args, child_conn))
|
|
177
|
+
return SimpleFuture(parent_conn)
|
|
178
|
+
|
|
179
|
+
def close(self) -> None:
|
|
180
|
+
"""
|
|
181
|
+
Closes the worker pool and terminates all worker processes.
|
|
182
|
+
|
|
183
|
+
Sends a stop signal to each worker and waits for them to terminate.
|
|
184
|
+
"""
|
|
185
|
+
logger.debug("Closing ProcessWorkerPoolSingleton...")
|
|
186
|
+
# Send a stop signal (None) for each worker.
|
|
187
|
+
for _ in range(self._total_workers):
|
|
188
|
+
self._task_queue.put(None)
|
|
189
|
+
logger.debug("Sent stop signal to worker.")
|
|
190
|
+
# Wait for all processes to finish.
|
|
191
|
+
for i, p in enumerate(self._processes):
|
|
192
|
+
p.join()
|
|
193
|
+
logger.debug("Worker process %d/%d joined: PID %d", i + 1, self._total_workers, p.pid)
|
|
194
|
+
logger.debug("ProcessWorkerPoolSingleton closed.")
|