nv-ingest-api 2025.4.21.dev20250421__py3-none-any.whl → 2025.4.23.dev20250423__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +215 -0
- nv_ingest_api/interface/extract.py +972 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +218 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +200 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +494 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
- nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
- nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
- nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +232 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +205 -0
- nv_ingest_api/internal/transform/embed_text.py +496 -0
- nv_ingest_api/internal/transform/split_text.py +157 -0
- nv_ingest_api/util/__init__.py +0 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +223 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +179 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
- nv_ingest_api/util/image_processing/transforms.py +407 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +31 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +451 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +469 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
- nv_ingest_api/util/nim/__init__.py +56 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +427 -0
- nv_ingest_api/util/schema/__init__.py +0 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +86 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +823 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +531 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.23.dev20250423.dist-info}/METADATA +1 -1
- nv_ingest_api-2025.4.23.dev20250423.dist-info/RECORD +152 -0
- {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.23.dev20250423.dist-info}/WHEEL +1 -1
- nv_ingest_api-2025.4.21.dev20250421.dist-info/RECORD +0 -9
- /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
- {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.23.dev20250423.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.4.21.dev20250421.dist-info → nv_ingest_api-2025.4.23.dev20250423.dist-info}/top_level.txt +0 -0
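Note on the {primitives → internal/primitives} entries above: modules formerly under nv_ingest_api.primitives now live under nv_ingest_api.internal.primitives, so any caller importing them by the old path needs a one-line update. A minimal sketch of that change, using only the module names visible in this listing (no class names are assumed):

# Illustrative only: import-path update for the primitives -> internal/primitives move.
# Module names are taken from the file listing above; no class names are assumed.

# 2025.4.21.dev20250421 layout:
# from nv_ingest_api.primitives import ingest_control_message

# 2025.4.23.dev20250423 layout:
from nv_ingest_api.internal.primitives import ingest_control_message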
nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py
@@ -0,0 +1,895 @@
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0


# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# pylint: disable=line-too-long
# pylint: disable=too-few-public-methods

import io
import logging
import re
import uuid
from typing import Dict, Optional, Union
from typing import List
from typing import Tuple

from collections import defaultdict

import pandas as pd
from docx import Document
from docx.image.constants import MIME_TYPE
from docx.image.image import Image
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import Table
from docx.table import _Cell
from docx.text.hyperlink import Hyperlink
from docx.text.paragraph import Paragraph
from docx.text.run import Run
from pandas import DataFrame

from nv_ingest_api.internal.enums.common import ContentDescriptionEnum, DocumentTypeEnum
from nv_ingest_api.internal.extract.image.image_helpers.common import (
    load_and_preprocess_image,
    extract_page_elements_from_images,
)
from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageConfigSchema
from nv_ingest_api.internal.schemas.meta.metadata_schema import (
    ContentTypeEnum,
    validate_metadata,
    TextTypeEnum,
)
from nv_ingest_api.util.converters import bytetools
from nv_ingest_api.util.detectors.language import detect_language
from nv_ingest_api.util.metadata.aggregators import construct_table_and_chart_metadata, CroppedImageWithContent

PARAGRAPH_FORMATS = ["text", "markdown"]
TABLE_FORMATS = ["markdown", "markdown_light", "csv", "tag"]

logger = logging.getLogger(__name__)


class DocxProperties:
    """
    Parse document core properties and update metadata
    """

    def __init__(self, document: Document, source_metadata: Dict):
        """
        Copy over some of the docx core properties
        """
        self.document = document
        self.source_metadata = source_metadata

        # Extract core properties with None checks
        core_properties = self.document.core_properties

        # Get properties with None handling
        self.title = core_properties.title

        # Author with fallback to last_modified_by if author is None
        self.author = core_properties.author if core_properties.author is not None else core_properties.last_modified_by

        self.created = core_properties.created
        self.modified = core_properties.modified
        self.keywords = core_properties.keywords

        self._update_source_meta_data()

    def __str__(self):
        """
        Print properties
        """
        info = "Document Properties:\n"
        info += f"title: {self.title}\n"
        info += f"author: {self.author}\n"

        # Handle date formatting safely
        if self.created is not None:
            info += f"created: {self.created.isoformat()}\n"
        else:
            info += "created: None\n"

        if self.modified is not None:
            info += f"modified: {self.modified.isoformat()}\n"
        else:
            info += "modified: None\n"

        info += f"keywords: {self.keywords}\n"

        return info

    def _update_source_meta_data(self):
        """
        Update the source metadata with the document's core properties
        """
        # Only update metadata if dates are available
        metadata_updates = {}

        if self.created is not None:
            metadata_updates["date_created"] = self.created.isoformat()

        if self.modified is not None:
            metadata_updates["last_modified"] = self.modified.isoformat()

        if metadata_updates:
            self.source_metadata.update(metadata_updates)


class DocxReader:
    __doc__ = f"""
    Read a docx file and extract its content as text, images and tables.

    Parameters
    ----------
    docx :
        Bytestream
    paragraph_format : str
        Format of the paragraphs. Supported formats are: {PARAGRAPH_FORMATS}
    table_format : str
        Format of the tables. Supported formats are: {TABLE_FORMATS}
    handle_text_styles : bool
        Whether to apply style on a paragraph (heading, list, title, subtitle).
        Not recommended if the document has been converted from pdf.
    image_tag : str
        Tag to replace the images in the text. Must contain one placeholder for the image index.
    table_tag : str
        Tag to replace the tables in the text. Must contain one placeholder for the table index.
    """

    def __init__(
        self,
        docx,
        source_metadata: Dict,
        paragraph_format: str = "markdown",
        table_format: str = "markdown",
        handle_text_styles: bool = True,
        image_tag="<image {}>",
        table_tag="<table {}>",
        extraction_config: Dict = None,
    ):
        if paragraph_format not in PARAGRAPH_FORMATS:
            raise ValueError(f"Unknown paragraph format {paragraph_format}. Supported formats are: {PARAGRAPH_FORMATS}")

        if table_format not in TABLE_FORMATS:
            raise ValueError(f"Unknown table format {table_format}. Supported formats are: {TABLE_FORMATS}")

        self.paragraph_format = paragraph_format
        self.table_format = table_format
        self.handle_text_styles = handle_text_styles
        self.image_tag = image_tag
        self.table_tag = table_tag

        # Read docx
        self.document = Document(docx)

        # Get the core properties
        self.properties = DocxProperties(self.document, source_metadata)
        logger.debug("%s", str(self.properties))

        self.trailing_space_pattern = re.compile(r"(^\s*)(.*?)(\s*$)", re.DOTALL)
        self.empty_text_pattern = re.compile(r"^\s*$")
        self.images = []
        self.tables = []
        self.image_tag_index = 1
        self.table_tag_index = 1

        # placeholders for metadata extraction
        self._accumulated_text = []
        self._extracted_data = []
        self._extraction_config = extraction_config if extraction_config else {}
        self._pending_images = []
        self._prev_para_image_idx = 0
        self._prev_para_images = []

    def is_text_empty(self, text: str) -> bool:
        """
        Check if the given text is empty or matches the empty text pattern.

        Parameters
        ----------
        text : str
            The text to check.

        Returns
        -------
        bool
            True if the text is empty or matches the empty text pattern, False otherwise.
        """

        return self.empty_text_pattern.match(text) is not None

    def format_text(self, text: str, bold: bool, italic: bool, underline: bool) -> str:
        """
        Apply markdown styling (bold, italic, underline) to the given text.

        Parameters
        ----------
        text : str
            The text to format.
        bold : bool
            Whether to apply bold styling.
        italic : bool
            Whether to apply italic styling.
        underline : bool
            Whether to apply underline styling.

        Returns
        -------
        str
            The formatted text with the applied styles.
        """

        if self.is_text_empty(text):
            return text

        # Exclude leading and trailing spaces from style
        match = self.trailing_space_pattern.match(text)
        if match:
            prefix, text, suffix = match.groups()
        else:
            prefix, suffix = "", ""

        # Apply style
        if bold:
            text = f"**{text}**"
        if italic:
            text = f"*{text}*"
        if underline:
            text = f"<u>{text}</u>"

        # Add back leading and trailing spaces
        text = prefix + text + suffix

        return text

    def format_paragraph(self, paragraph: "Paragraph") -> Tuple[str, List["Image"]]:
        """
        Format a paragraph into styled text and extract associated images.

        Parameters
        ----------
        paragraph : Paragraph
            The paragraph to format. This includes text and potentially embedded images.

        Returns
        -------
        tuple of (str, list of Image)
            - The formatted paragraph text with markdown styling applied.
            - A list of extracted images from the paragraph.
        """

        paragraph_images = []
        if self.paragraph_format == "text":
            paragraph_text = paragraph.text
        else:
            # Get the default style of the paragraph, "markdown"
            font = paragraph.style.font
            default_style = (font.bold, font.italic, font.underline)

            # Iterate over the runs of the paragraph and group them by style, excluding empty runs
            paragraph_text = ""
            group_text = ""
            previous_style = None

            for c in paragraph.iter_inner_content():
                if isinstance(c, Hyperlink):
                    text = f"[{c.text}]({c.address})"
                    style = (c.runs[0].bold, c.runs[0].italic, c.runs[0].underline)
                elif isinstance(c, Run):
                    text = c.text
                    style = (c.bold, c.italic, c.underline)
                    # 1. Locate the inline shape which is stored in the <w:drawing> element.
                    # 2. r:embed in <a.blip> has the relationship id for extracting the file where
                    # the image is stored as bytes.
                    # Reference:
                    # https://python-docx.readthedocs.io/en/latest/dev/analysis/features/shapes/picture.html#specimen-xml
                    inline_shapes = c._element.xpath(".//w:drawing//a:blip/@r:embed")
                    for r_id in inline_shapes:
                        text += self.image_tag.format(self.image_tag_index)
                        self.image_tag_index += 1
                        image = paragraph.part.related_parts[r_id].image
                        paragraph_images.append(image)
                else:
                    continue

                style = tuple([s if s is not None else d for s, d in zip(style, default_style)])

                # If the style changes for a non empty text, format the previous group and start a new one
                if (not self.is_text_empty(text)) and (previous_style is not None):
                    if style != previous_style:
                        paragraph_text += self.format_text(group_text, *previous_style)
                        group_text = ""

                group_text += text
                if not self.is_text_empty(text):
                    previous_style = style

            # Format the last group
            if group_text:
                paragraph_text += self.format_text(group_text, *style)

        # Remove trailing spaces
        paragraph_text = paragraph_text.strip()
        return paragraph_text, paragraph_images

    def format_cell(self, cell: "_Cell") -> Tuple[str, List["Image"]]:
        """
        Format a table cell into Markdown text and extract associated images.

        Parameters
        ----------
        cell : _Cell
            The table cell to format.

        Returns
        -------
        tuple of (str, list of Image)
            - The formatted text of the cell with markdown styling applied.
            - A list of images extracted from the cell.
        """

        if self.paragraph_format == "markdown":
            newline = "<br>"
        else:
            newline = "\n"
        paragraph_texts, paragraph_images = zip(*[self.format_paragraph(p) for p in cell.paragraphs])
        return newline.join(paragraph_texts), paragraph_images

    def format_table(self, table: "Table") -> Tuple[Optional[str], List["Image"], DataFrame]:
        """
        Format a table into text, extract images, and represent it as a DataFrame.

        Parameters
        ----------
        table : Table
            The table to format.

        Returns
        -------
        tuple of (str or None, list of Image, DataFrame)
            - The formatted table as text, using the specified format (e.g., markdown, CSV).
            - A list of images extracted from the table.
            - A DataFrame representation of the table's content.
        """

        rows = [[self.format_cell(cell) for cell in row.cells] for row in table.rows]
        texts = [[text for text, _ in row] for row in rows]
        table_images = [image for row in rows for _, images in row for image in images]

        table = pd.DataFrame(texts[1:], columns=texts[0])
        if "markdown" in self.table_format:
            table_text = table.to_markdown(index=False)
            if self.table_format == "markdown_light":
                table_text = re.sub(r"\s{2,}", " ", table_text)
                table_text = re.sub(r"-{2,}", "-", table_text)
        elif self.table_format == "csv":
            table_text = table.to_csv()
        elif self.table_format == "tag":
            table_text = self.table_tag.format(self.table_tag_index)
            self.table_tag_index += 1
        else:
            raise ValueError(f"Unknown table format {format}")

        return table_text, table_images, table

    @staticmethod
    def apply_text_style(style: str, text: str, level: int = 0) -> str:
        """
        Apply a specific text style (e.g., heading, list, title, subtitle) to the given text.

        Parameters
        ----------
        style : str
            The style to apply. Supported styles include headings ("Heading 1" to "Heading 9"),
            list items ("List"), and document structures ("Title", "Subtitle").
        text : str
            The text to style.
        level : int, optional
            The indentation level for the styled text. Default is 0.

        Returns
        -------
        str
            The text with the specified style and indentation applied.
        """

        if re.match(r"^Heading [1-9]$", style):
            n = int(style.split(" ")[-1])
            text = f"{'#' * n} {text}"
        elif style.startswith("List"):
            text = f"- {text}"
        elif style == "Title":
            text = f"{text}\n{'=' * len(text)}"
        elif style == "Subtitle":
            text = f"{text}\n{'-' * len(text)}"

        text = "\t" * level + text

        return text

    @staticmethod
    def docx_content_type_to_image_type(content_type: "MIME_TYPE") -> str:
        """
        Convert a DOCX content type string to an image type.

        Parameters
        ----------
        content_type : MIME_TYPE
            The content type string from the image header, e.g., "image/jpeg".

        Returns
        -------
        str
            The image type extracted from the content type string.
        """

        return content_type.split("/")[1]

    def _construct_image_metadata(
        self, para_idx: int, caption: str, base_unified_metadata: Dict, base64_img: str
    ) -> List[Union[str, dict]]:
        """
        Build metadata for an image in a DOCX file.

        Parameters
        ----------
        para_idx : int
            The paragraph index containing the image.
        caption : str
            The caption associated with the image.
        base_unified_metadata : dict
            The base metadata to build upon.
        base64_img : str
            The image content encoded as a base64 string.

        Returns
        -------
        list
            A list containing the content type, validated metadata, and a unique identifier.
        """

        bbox = (0, 0, 0, 0)
        caption_len = len(caption.splitlines())

        page_idx = 0  # docx => single page
        page_count = 1

        page_nearby_blocks = {
            "text": {"content": [], "bbox": []},
            "images": {"content": [], "bbox": []},
            "structured": {"content": [], "bbox": []},
        }

        if caption_len:
            page_nearby_blocks["text"]["content"].append(caption)
            page_nearby_blocks["text"]["bbox"] = [[-1, -1, -1, -1]] * caption_len

        content_metadata = {
            "type": ContentTypeEnum.IMAGE,
            "description": ContentDescriptionEnum.DOCX_IMAGE,
            "page_number": page_idx,
            "hierarchy": {
                "page_count": page_count,
                "page": page_idx,
                "block": para_idx,
                "line": -1,
                "span": -1,
                "nearby_objects": page_nearby_blocks,
            },
        }

        image_metadata = {
            "image_type": DocumentTypeEnum.PNG,
            "structured_image_type": ContentTypeEnum.NONE,
            "caption": caption,
            "text": "",
            "image_location": bbox,
        }

        unified_metadata = base_unified_metadata.copy()
        unified_metadata.update(
            {
                "content": base64_img,
                "source_metadata": self.properties.source_metadata,
                "content_metadata": content_metadata,
                "image_metadata": image_metadata,
            }
        )

        validated_unified_metadata = validate_metadata(unified_metadata)

        return [
            ContentTypeEnum.IMAGE.value,
            validated_unified_metadata.model_dump(),
            str(uuid.uuid4()),
        ]

    def _extract_para_images(
        self, images: List["Image"], para_idx: int, caption: str, base_unified_metadata: Dict
    ) -> None:
        """
        Collect images from a paragraph and store them for metadata construction.

        Parameters
        ----------
        images : list of Image
            The images found in the paragraph.
        para_idx : int
            The index of the paragraph containing the images.
        caption : str
            The caption associated with the images.
        base_unified_metadata : dict
            The base metadata to associate with the images.

        Returns
        -------
        None
        """

        for image in images:
            logger.debug("image content_type %s para_idx %d", image.content_type, para_idx)
            logger.debug("image caption %s", caption)

            # Simply append a tuple so we can build the final metadata in _finalize_images
            self._pending_images.append((image, para_idx, caption, base_unified_metadata))

    def _construct_text_metadata(
        self, accumulated_text: List[str], para_idx: int, text_depth: "TextTypeEnum", base_unified_metadata: Dict
    ) -> List[Union[str, dict]]:
        """
        Build metadata for text content in a DOCX file.

        Parameters
        ----------
        accumulated_text : list of str
            The accumulated text to include in the metadata.
        para_idx : int
            The paragraph index containing the text.
        text_depth : TextTypeEnum
            The depth of the text content (e.g., page-level, paragraph-level).
        base_unified_metadata : dict
            The base metadata to build upon.

        Returns
        -------
        list
            A list containing the content type, validated metadata, and a unique identifier.
        """

        if len(accumulated_text) < 1:
            return []

        extracted_text = " ".join(accumulated_text)

        # the document is treated as a single page
        page_number = 0 if text_depth == TextTypeEnum.PAGE else -1
        content_metadata = {
            "type": ContentTypeEnum.TEXT,
            "description": ContentDescriptionEnum.DOCX_TEXT,
            "page_number": page_number,
            "hierarchy": {
                "page_count": 1,
                "page": page_number,
                "block": para_idx,
                "line": -1,
                "span": -1,
            },
        }

        language = detect_language(extracted_text)
        text_metadata = {
            "text_type": text_depth,
            "summary": "",
            "keywords": self.properties.keywords,
            "language": language,
            "text_location": (-1, -1, -1, -1),
        }

        ext_unified_metadata = base_unified_metadata.copy() if base_unified_metadata else {}
        ext_unified_metadata.update(
            {
                "content": extracted_text,
                "source_metadata": self.properties.source_metadata,
                "content_metadata": content_metadata,
                "text_metadata": text_metadata,
            }
        )

        validated_unified_metadata = validate_metadata(ext_unified_metadata)

        return [ContentTypeEnum.TEXT.value, validated_unified_metadata.model_dump(), str(uuid.uuid4())]

    def _extract_para_text(
        self,
        paragraph,
        paragraph_text,
        base_unified_metadata: Dict,
        text_depth: str,
        para_idx: int,
    ) -> None:
        """
        Process the text, images, and styles in a DOCX paragraph.

        Parameters
        ----------
        paragraph: Paragraph
            The paragraph to process.
        paragraph_text: str
            The text content of the paragraph.
        base_unified_metadata : dict
            The base metadata to associate with extracted data.
        text_depth : TextTypeEnum
            The depth of text extraction (e.g., block-level, document-level).
        para_idx : int
            The index of the paragraph being processed.

        Returns
        -------
        None
        """

        # Handle text styles if desired
        if self.handle_text_styles:
            try:
                numPr = paragraph._element.xpath("./w:pPr/w:numPr")[0]
                level = int(numPr.xpath("./w:ilvl/@w:val")[0])
            except Exception:
                level = -1
            paragraph_text = self.apply_text_style(paragraph.style.name, paragraph_text, level)

        self._accumulated_text.append(paragraph_text + "\n")

        # If text_depth is BLOCK, we flush after each paragraph
        if text_depth == TextTypeEnum.BLOCK:
            text_extraction = self._construct_text_metadata(
                self._accumulated_text, para_idx, text_depth, base_unified_metadata
            )
            self._extracted_data.append(text_extraction)
            self._accumulated_text = []

    def _finalize_images(self, extract_tables: bool, extract_charts: bool, **kwargs) -> None:
        """
        Build and append final metadata for each pending image in batches.

        Parameters
        ----------
        extract_tables : bool
            Whether to attempt table detection in images.
        extract_charts : bool
            Whether to attempt chart detection in images.
        **kwargs
            Additional configuration for image processing.

        Returns
        -------
        None
        """
        if not self._pending_images:
            return

        # 1) Convert all pending images into numpy arrays (and also store base64 + context),
        # so we can run detection on them in one go.
        all_image_arrays = []
        image_info = []  # parallel list to hold (para_idx, caption, base_unified_metadata, base64_img)

        for docx_image, para_idx, caption, base_unified_metadata in self._pending_images:
            # Convert docx image blob to BytesIO, then to numpy array
            image_bytes = docx_image.blob
            image_stream = io.BytesIO(image_bytes)
            image_array = load_and_preprocess_image(image_stream)
            base64_img = str(bytetools.base64frombytes(image_bytes))

            all_image_arrays.append(image_array)

            # Keep track of all needed metadata so we can rebuild final entries
            image_info.append((para_idx, caption, base_unified_metadata, base64_img))

        # 2) If the user wants to detect tables/charts, do it in one pass for all images.
        detection_map = defaultdict(list)  # maps image_index -> list of CroppedImageWithContent

        if extract_tables or extract_charts:
            try:
                # Perform the batched detection on all images
                detection_results = extract_page_elements_from_images(
                    images=all_image_arrays,
                    config=ImageConfigSchema(**self._extraction_config.model_dump()),
                    trace_info=kwargs.get("trace_info"),
                )
                # detection_results is typically List[Tuple[int, CroppedImageWithContent]]
                # Group by image_index
                for image_idx, cropped_item in detection_results:
                    detection_map[image_idx].append(cropped_item)

            except Exception as e:
                logger.error(f"Error extracting tables/charts in batch: {e}")
                # If something goes wrong, we can fall back to empty detection map
                # so that all images are treated normally
                detection_map = {}

        # 3) For each pending image, decide if we found tables/charts or not.
        for i, _ in enumerate(self._pending_images):
            para_idx_i, caption_i, base_unified_metadata_i, base64_img_i = image_info[i]

            # If detection_map[i] is non-empty, we have found table(s)/chart(s).
            if i in detection_map and detection_map[i]:
                for table_chart_data in detection_map[i]:
                    # Build structured metadata for each table or chart
                    structured_entry = construct_table_and_chart_metadata(
                        structured_image=table_chart_data,  # A CroppedImageWithContent
                        page_idx=0,  # docx => single page
                        page_count=1,
                        source_metadata=self.properties.source_metadata,
                        base_unified_metadata=base_unified_metadata_i,
                    )
                    self._extracted_data.append(structured_entry)
            else:
                # Either detection was not requested, or no table/chart was found
                image_entry = self._construct_image_metadata(
                    para_idx_i,
                    caption_i,
                    base_unified_metadata_i,
                    base64_img_i,
                )
                self._extracted_data.append(image_entry)

        # 4) Clear out the pending images after finalizing
        self._pending_images = []

    def _extract_table_data(
        self,
        child,
        base_unified_metadata: Dict,
    ) -> None:
        """
        Process the text and images in a DOCX table.

        Parameters
        ----------
        child : element
            The table element to process.
        base_unified_metadata : dict
            The base metadata to associate with extracted data.
        text_depth : TextTypeEnum
            The depth of text extraction (e.g., block-level, document-level).
        para_idx : int
            The index of the table being processed.

        Returns
        -------
        None
        """

        # Table
        table = Table(child, self.document)
        table_text, table_images, table_dataframe = self.format_table(table)

        self.images += table_images
        self.tables.append(table_dataframe)

        cropped_image_with_content = CroppedImageWithContent(
            content=table_text,
            image="",  # no image content
            bbox=(0, 0, 0, 0),
            max_width=0,
            max_height=0,
            type_string="table",
        )

        self._extracted_data.append(
            construct_table_and_chart_metadata(
                structured_image=cropped_image_with_content,
                page_idx=0,  # docx => single page
                page_count=1,
                source_metadata=self.properties.source_metadata,
                base_unified_metadata=base_unified_metadata,
            )
        )

    def extract_data(
        self,
        base_unified_metadata: Dict,
        text_depth: "TextTypeEnum",
        extract_text: bool,
        extract_charts: bool,
        extract_tables: bool,
        extract_images: bool,
    ) -> list[list[str | dict]]:
        """
        Iterate over paragraphs and tables in a DOCX document to extract data.

        Parameters
        ----------
        base_unified_metadata : dict
            The base metadata to associate with all extracted content.
        text_depth : TextTypeEnum
            The depth of text extraction (e.g., block-level, document-level).
        extract_text : bool
            Whether to extract text from the document.
        extract_charts : bool
            Whether to extract charts from the document.
        extract_tables : bool
            Whether to extract tables from the document.
        extract_images : bool
            Whether to extract images from the document.

        Returns
        -------
        dict
            A dictionary containing the extracted data from the document.
        """

        self._accumulated_text = []
        self._extracted_data = []
        self._pending_images = []
        self._prev_para_images = []
        self._prev_para_image_idx = 0

        para_idx = 0

        for child in self.document.element.body.iterchildren():
            if isinstance(child, CT_P):
                paragraph = Paragraph(child, self.document)
                paragraph_text, paragraph_images = self.format_paragraph(paragraph)

                if extract_text:
                    self._extract_para_text(
                        paragraph,
                        paragraph_text,
                        base_unified_metadata,
                        text_depth,
                        para_idx,
                    )

                if (extract_charts or extract_images or extract_tables) and paragraph_images:
                    self._prev_para_images = paragraph_images
                    self._prev_para_image_idx = para_idx
                    self._pending_images += [(image, para_idx, "", base_unified_metadata) for image in paragraph_images]
                    self.images += paragraph_images

            elif isinstance(child, CT_Tbl):
                if extract_tables or extract_charts:
                    self._extract_table_data(child, base_unified_metadata)

            para_idx += 1

        # If there's leftover text at the doc's end
        if (
            extract_text
            and text_depth in (TextTypeEnum.DOCUMENT, TextTypeEnum.PAGE)
            and len(self._accumulated_text) > 0
        ):
            text_extraction = self._construct_text_metadata(
                self._accumulated_text,
                -1,
                text_depth,
                base_unified_metadata,
            )

            if text_extraction:
                self._extracted_data.append(text_extraction)

        # Final pass: Decide if images are just images or contain tables/charts
        if extract_images or extract_tables or extract_charts:
            self._finalize_images(
                extract_tables=extract_tables,
                extract_charts=extract_charts,
                trace_info=None,
            )

        return self._extracted_data