nv-ingest-api 2025.4.18.dev20250418__py3-none-any.whl → 2025.4.19.dev20250419__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/__init__.py +0 -3
- nv_ingest_api/{internal/primitives → primitives}/control_message_task.py +0 -4
- nv_ingest_api/{internal/primitives → primitives}/ingest_control_message.py +2 -5
- {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/METADATA +1 -1
- nv_ingest_api-2025.4.19.dev20250419.dist-info/RECORD +9 -0
- {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/WHEEL +1 -1
- nv_ingest_api/interface/__init__.py +0 -215
- nv_ingest_api/interface/extract.py +0 -972
- nv_ingest_api/interface/mutate.py +0 -154
- nv_ingest_api/interface/store.py +0 -218
- nv_ingest_api/interface/transform.py +0 -382
- nv_ingest_api/interface/utility.py +0 -200
- nv_ingest_api/internal/enums/__init__.py +0 -3
- nv_ingest_api/internal/enums/common.py +0 -494
- nv_ingest_api/internal/extract/__init__.py +0 -3
- nv_ingest_api/internal/extract/audio/__init__.py +0 -3
- nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -149
- nv_ingest_api/internal/extract/docx/__init__.py +0 -5
- nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -205
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -3
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -122
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -895
- nv_ingest_api/internal/extract/image/__init__.py +0 -3
- nv_ingest_api/internal/extract/image/chart_extractor.py +0 -353
- nv_ingest_api/internal/extract/image/image_extractor.py +0 -204
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -3
- nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -403
- nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -253
- nv_ingest_api/internal/extract/image/table_extractor.py +0 -344
- nv_ingest_api/internal/extract/pdf/__init__.py +0 -3
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -19
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -484
- nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -243
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -597
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -146
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -603
- nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -96
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -426
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -74
- nv_ingest_api/internal/extract/pptx/__init__.py +0 -5
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -799
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -187
- nv_ingest_api/internal/mutate/__init__.py +0 -3
- nv_ingest_api/internal/mutate/deduplicate.py +0 -110
- nv_ingest_api/internal/mutate/filter.py +0 -133
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/internal/primitives/nim/__init__.py +0 -8
- nv_ingest_api/internal/primitives/nim/default_values.py +0 -15
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -3
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -274
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -56
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -270
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -275
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -238
- nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -462
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -367
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -132
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -152
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -1400
- nv_ingest_api/internal/primitives/nim/nim_client.py +0 -344
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -81
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +0 -69
- nv_ingest_api/internal/primitives/tracing/logging.py +0 -96
- nv_ingest_api/internal/primitives/tracing/tagging.py +0 -197
- nv_ingest_api/internal/schemas/__init__.py +0 -3
- nv_ingest_api/internal/schemas/extract/__init__.py +0 -3
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -130
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -135
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -124
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -124
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -128
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -218
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -124
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -129
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -3
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -23
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -34
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -19
- nv_ingest_api/internal/schemas/meta/__init__.py +0 -3
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -11
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -237
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -221
- nv_ingest_api/internal/schemas/mutate/__init__.py +0 -3
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -16
- nv_ingest_api/internal/schemas/store/__init__.py +0 -3
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -28
- nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -30
- nv_ingest_api/internal/schemas/transform/__init__.py +0 -3
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -15
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -17
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -25
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -22
- nv_ingest_api/internal/store/__init__.py +0 -3
- nv_ingest_api/internal/store/embed_text_upload.py +0 -236
- nv_ingest_api/internal/store/image_upload.py +0 -232
- nv_ingest_api/internal/transform/__init__.py +0 -3
- nv_ingest_api/internal/transform/caption_image.py +0 -205
- nv_ingest_api/internal/transform/embed_text.py +0 -496
- nv_ingest_api/internal/transform/split_text.py +0 -157
- nv_ingest_api/util/__init__.py +0 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +0 -47
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +0 -78
- nv_ingest_api/util/converters/containers.py +0 -65
- nv_ingest_api/util/converters/datetools.py +0 -90
- nv_ingest_api/util/converters/dftools.py +0 -127
- nv_ingest_api/util/converters/formats.py +0 -64
- nv_ingest_api/util/converters/type_mappings.py +0 -27
- nv_ingest_api/util/detectors/__init__.py +0 -5
- nv_ingest_api/util/detectors/language.py +0 -38
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +0 -72
- nv_ingest_api/util/exception_handlers/decorators.py +0 -223
- nv_ingest_api/util/exception_handlers/detectors.py +0 -74
- nv_ingest_api/util/exception_handlers/pdf.py +0 -116
- nv_ingest_api/util/exception_handlers/schemas.py +0 -68
- nv_ingest_api/util/image_processing/__init__.py +0 -5
- nv_ingest_api/util/image_processing/clustering.py +0 -260
- nv_ingest_api/util/image_processing/processing.py +0 -179
- nv_ingest_api/util/image_processing/table_and_chart.py +0 -449
- nv_ingest_api/util/image_processing/transforms.py +0 -407
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +0 -31
- nv_ingest_api/util/message_brokers/__init__.py +0 -3
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -9
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -465
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -71
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -451
- nv_ingest_api/util/metadata/__init__.py +0 -5
- nv_ingest_api/util/metadata/aggregators.py +0 -469
- nv_ingest_api/util/multi_processing/__init__.py +0 -8
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -194
- nv_ingest_api/util/nim/__init__.py +0 -56
- nv_ingest_api/util/pdf/__init__.py +0 -3
- nv_ingest_api/util/pdf/pdfium.py +0 -427
- nv_ingest_api/util/schema/__init__.py +0 -0
- nv_ingest_api/util/schema/schema_validator.py +0 -10
- nv_ingest_api/util/service_clients/__init__.py +0 -3
- nv_ingest_api/util/service_clients/client_base.py +0 -86
- nv_ingest_api/util/service_clients/kafka/__init__.py +0 -3
- nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +0 -823
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +0 -531
- nv_ingest_api/util/string_processing/__init__.py +0 -51
- nv_ingest_api-2025.4.18.dev20250418.dist-info/RECORD +0 -152
- /nv_ingest_api/{internal → primitives}/__init__.py +0 -0
- {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/top_level.txt +0 -0
|
@@ -1,799 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
-
# All rights reserved.
|
|
3
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
# Copyright (c) 2024, NVIDIA CORPORATION.
|
|
5
|
-
#
|
|
6
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
-
# you may not use this file except in compliance with the License.
|
|
8
|
-
# You may obtain a copy of the License at
|
|
9
|
-
#
|
|
10
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
-
#
|
|
12
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
13
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
-
# See the License for the specific language governing permissions and
|
|
16
|
-
# limitations under the License.
|
|
17
|
-
|
|
18
|
-
import logging
|
|
19
|
-
import io
|
|
20
|
-
import operator
|
|
21
|
-
import re
|
|
22
|
-
import uuid
|
|
23
|
-
from collections import defaultdict
|
|
24
|
-
from datetime import datetime
|
|
25
|
-
from typing import Dict, List, Tuple, IO
|
|
26
|
-
from typing import Optional
|
|
27
|
-
|
|
28
|
-
import pandas as pd
|
|
29
|
-
from pptx import Presentation
|
|
30
|
-
from pptx.enum.dml import MSO_COLOR_TYPE
|
|
31
|
-
from pptx.enum.dml import MSO_THEME_COLOR
|
|
32
|
-
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
|
33
|
-
from pptx.enum.shapes import PP_PLACEHOLDER
|
|
34
|
-
from pptx.shapes.autoshape import Shape
|
|
35
|
-
from pptx.slide import Slide
|
|
36
|
-
|
|
37
|
-
from nv_ingest_api.internal.enums.common import AccessLevelEnum, DocumentTypeEnum
|
|
38
|
-
from nv_ingest_api.internal.enums.common import ContentTypeEnum
|
|
39
|
-
from nv_ingest_api.internal.enums.common import ContentDescriptionEnum
|
|
40
|
-
from nv_ingest_api.internal.enums.common import TableFormatEnum
|
|
41
|
-
from nv_ingest_api.internal.enums.common import TextTypeEnum
|
|
42
|
-
from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadata
|
|
43
|
-
from nv_ingest_api.internal.extract.image.image_helpers.common import (
|
|
44
|
-
load_and_preprocess_image,
|
|
45
|
-
extract_page_elements_from_images,
|
|
46
|
-
)
|
|
47
|
-
from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageConfigSchema
|
|
48
|
-
from nv_ingest_api.internal.schemas.extract.extract_pptx_schema import PPTXConfigSchema
|
|
49
|
-
from nv_ingest_api.util.converters import bytetools
|
|
50
|
-
from nv_ingest_api.util.detectors.language import detect_language
|
|
51
|
-
from nv_ingest_api.util.metadata.aggregators import construct_page_element_metadata
|
|
52
|
-
|
|
53
|
-
# Module-level logger, named after this module (standard per-module logging pattern).
logger = logging.getLogger(__name__)
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
def _finalize_images(
    pending_images: List[Tuple[Shape, int, int, int, dict, dict, dict]],
    extracted_data: List,
    pptx_extraction_config: PPTXConfigSchema,
    extract_tables: bool = False,
    extract_charts: bool = False,
    trace_info: Optional[Dict] = None,
):
    """
    Post-process every deferred PPTX image.

    Each pending shape is decoded into a NumPy array plus a base64 payload.
    When ``extract_tables`` or ``extract_charts`` is set, the decoded arrays
    are batched through page-element detection; images with detections yield
    structured (table/chart) metadata entries, all others yield plain image
    metadata entries. Results are appended to ``extracted_data`` in place.

    This mirrors the docx approach, adapted for python-pptx shapes.
    """
    if not pending_images:
        return

    # Decode every shape up-front so detection can run as a single batch call.
    decoded_arrays = []
    decoded_contexts = []
    for shape, shape_idx, slide_idx, slide_count, page_nearby_blocks, source_metadata, base_unified_metadata in (
        pending_images
    ):
        try:
            raw_bytes = shape.image.blob
            array = load_and_preprocess_image(io.BytesIO(raw_bytes))
            encoded = bytetools.base64frombytes(raw_bytes)

            decoded_arrays.append(array)
            decoded_contexts.append(
                (shape_idx, slide_idx, slide_count, page_nearby_blocks, source_metadata, base_unified_metadata, encoded)
            )
        except Exception as e:
            # A single undecodable image should not abort the whole document.
            logger.warning(f"Unable to process shape image: {e}")

    # Optional table/chart detection over the batch of decoded images.
    detections = defaultdict(list)  # image index -> list of CroppedImageWithContent
    if extract_tables or extract_charts:
        try:
            detection_results = extract_page_elements_from_images(
                images=decoded_arrays,
                config=ImageConfigSchema(**(pptx_extraction_config.model_dump())),
                trace_info=trace_info,
            )
            # detection_results is a sequence of (image_idx, CroppedImageWithContent).
            for img_idx, cropped_obj in detection_results:
                detections[img_idx].append(cropped_obj)
        except Exception as e:
            logger.error(f"Error while running table/chart detection on PPTX images: {e}")
            detections = {}

    # Emit one metadata record per detection, or a plain image record otherwise.
    for idx, ctx in enumerate(decoded_contexts):
        shape_idx, slide_idx, slide_count, page_nearby_blocks, source_metadata, base_unified_metadata, encoded = ctx

        hits = detections.get(idx)  # .get avoids defaultdict auto-insertion
        if hits:
            for cropped_item in hits:
                extracted_data.append(
                    construct_page_element_metadata(
                        structured_image=cropped_item,
                        page_idx=slide_idx,
                        page_count=slide_count,
                        source_metadata=source_metadata,
                        base_unified_metadata=base_unified_metadata,
                    )
                )
        else:
            extracted_data.append(
                _construct_image_metadata(
                    shape_idx=shape_idx,
                    slide_idx=slide_idx,
                    slide_count=slide_count,
                    page_nearby_blocks=page_nearby_blocks,
                    base64_img=encoded,
                    source_metadata=source_metadata,
                    base_unified_metadata=base_unified_metadata,
                )
            )
156
|
-
|
|
157
|
-
|
|
158
|
-
# -----------------------------------------------------------------------------
# Helper Function: Recursive Image Extraction
# -----------------------------------------------------------------------------
def process_shape(
    shape, shape_idx, slide_idx, slide_count, pending_images, page_nearby_blocks, source_metadata, base_unified_metadata
):
    """
    Walk a shape tree, queueing every embedded picture for deferred handling.

    Group shapes are traversed recursively; children get a composite index
    such as "2.1" (first child of shape 2). Pictures — and OBJECT placeholders
    that carry an image — are appended to ``pending_images``.
    """
    if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
        for child_pos, child in enumerate(shape.shapes):
            process_shape(
                child,
                f"{shape_idx}.{child_pos}",  # composite index for nested shapes
                slide_idx,
                slide_count,
                pending_images,
                page_nearby_blocks,
                source_metadata,
                base_unified_metadata,
            )
        return

    # Keep the original short-circuit order: placeholder attributes are only
    # touched when the shape is not a picture.
    holds_image = shape.shape_type == MSO_SHAPE_TYPE.PICTURE or (
        shape.is_placeholder and shape.placeholder_format.type == PP_PLACEHOLDER.OBJECT and hasattr(shape, "image")
    )
    if not holds_image:
        return

    try:
        pending_images.append(
            (
                shape,  # retained so the finalizer can read shape.image.blob later
                shape_idx,
                slide_idx,
                slide_count,
                page_nearby_blocks,
                source_metadata,
                base_unified_metadata,
            )
        )
    except Exception as e:
        logger.warning(f"Error processing shape {shape_idx} on slide {slide_idx}: {e}")
        raise
|
-
|
|
203
|
-
|
|
204
|
-
# -----------------------------------------------------------------------------
# Main Extraction Function
# -----------------------------------------------------------------------------
def python_pptx(
    *,
    pptx_stream: IO,
    extract_text: bool,
    extract_images: bool,
    extract_infographics: bool,
    extract_tables: bool,
    extract_charts: bool,
    extraction_config: dict,
    execution_trace_log: Optional[List] = None,
):
    """
    Uses python-pptx to extract text from a PPTX bytestream, while deferring image
    classification into tables/charts if requested.

    Parameters
    ----------
    pptx_stream : IO
        Byte stream containing the PPTX document.
    extract_text : bool
        Emit text records at the granularity named by ``extraction_config["text_depth"]``
        (span / line / block / page / document).
    extract_images : bool
        Queue slide images (including those nested in groups) for deferred
        processing via ``_finalize_images``.
    extract_infographics : bool
        Currently unused placeholder flag.
    extract_tables : bool
        Emit records for native PPTX tables and run table detection on queued images.
    extract_charts : bool
        Run chart detection on queued images.
    extraction_config : dict
        Per-job options; must contain ``row_data`` with a ``"source_id"`` entry.
    execution_trace_log : Optional[List]
        Currently unused placeholder for tracing.

    Returns
    -------
    list
        Extraction entries of the form ``[content_type, metadata_dict, uuid_str]``.
    """

    _ = extract_infographics  # Placeholder for future use
    _ = execution_trace_log  # Placeholder for future use

    row_data = extraction_config.get("row_data")
    source_id = row_data["source_id"]

    # Resolve the requested text granularity to its enum member.
    text_depth = extraction_config.get("text_depth", "page")
    text_depth = TextTypeEnum[text_depth.upper()]

    paragraph_format = extraction_config.get("paragraph_format", "markdown")
    identify_nearby_objects = extraction_config.get("identify_nearby_objects", True)

    metadata_col = extraction_config.get("metadata_column", "metadata")
    pptx_extractor_config = extraction_config.get("pptx_extraction_config", {})
    trace_info = extraction_config.get("trace_info", {})

    # NOTE(review): row_data appears to be a pandas Series (`.index` membership
    # check below) — confirm against the caller.
    base_unified_metadata = row_data[metadata_col] if metadata_col in row_data.index else {}
    base_source_metadata = base_unified_metadata.get("source_metadata", {})
    source_location = base_source_metadata.get("source_location", "")
    collection_id = base_source_metadata.get("collection_id", "")
    partition_id = base_source_metadata.get("partition_id", -1)
    access_level = base_source_metadata.get("access_level", AccessLevelEnum.UNKNOWN)

    presentation = Presentation(pptx_stream)

    # Collect source metadata from the core properties of the document,
    # falling back to "now" when the document carries no timestamps.
    last_modified = (
        presentation.core_properties.modified.isoformat()
        if presentation.core_properties.modified
        else datetime.now().isoformat()
    )
    date_created = (
        presentation.core_properties.created.isoformat()
        if presentation.core_properties.created
        else datetime.now().isoformat()
    )
    keywords = presentation.core_properties.keywords
    source_type = DocumentTypeEnum.PPTX
    source_metadata = {
        "source_name": source_id,  # python-pptx doesn't maintain filename; re-use source_id
        "source_id": source_id,
        "source_location": source_location,
        "source_type": source_type,
        "collection_id": collection_id,
        "date_created": date_created,
        "last_modified": last_modified,
        "summary": "",
        "partition_id": partition_id,
        "access_level": access_level,
    }

    slide_count = len(presentation.slides)

    accumulated_text = []
    extracted_data = []

    # Hold images here for final classification.
    # Each item is (shape, shape_idx, slide_idx, slide_count, page_nearby_blocks, source_metadata,
    # base_unified_metadata)
    pending_images = []

    for slide_idx, slide in enumerate(presentation.slides):
        # Obtain a flat list of shapes (ungrouped) sorted by top then left.
        shapes = sorted(ungroup_shapes(slide.shapes), key=operator.attrgetter("top", "left"))

        # Per-slide accumulator of text/image/structured context used to
        # annotate images with their "nearby objects".
        page_nearby_blocks = {
            "text": {"content": [], "bbox": []},
            "images": {"content": [], "bbox": []},
            "structured": {"content": [], "bbox": []},
        }

        for shape_idx, shape in enumerate(shapes):
            block_text = []
            added_title = added_subtitle = False

            # ---------------------------------------------
            # 1) Text Extraction
            # ---------------------------------------------
            if extract_text and shape.has_text_frame:
                for paragraph_idx, paragraph in enumerate(shape.text_frame.paragraphs):
                    if not paragraph.text.strip():
                        continue

                    for run_idx, run in enumerate(paragraph.runs):
                        text = run.text
                        if not text:
                            continue

                        text = escape_text(text)

                        if paragraph_format == "markdown":
                            # Titles/subtitles are emitted once per shape;
                            # subsequent runs of the same shape are skipped.
                            if is_title(shape):
                                if not added_title:
                                    text = process_title(shape)
                                    added_title = True
                                else:
                                    continue
                            elif is_subtitle(shape):
                                if not added_subtitle:
                                    text = process_subtitle(shape)
                                    added_subtitle = True
                                else:
                                    continue
                            else:
                                if run.hyperlink.address:
                                    text = get_hyperlink(text, run.hyperlink.address)
                                # Emphasis is mutually exclusive: accent > strong > underline.
                                if is_accent(paragraph.font) or is_accent(run.font):
                                    text = format_text(text, italic=True)
                                elif is_strong(paragraph.font) or is_strong(run.font):
                                    text = format_text(text, bold=True)
                                elif is_underlined(paragraph.font) or is_underlined(run.font):
                                    text = format_text(text, underline=True)
                                if is_list_block(shape):
                                    # Indent by list nesting level, markdown bullet.
                                    text = " " * paragraph.level + "* " + text

                        accumulated_text.append(text)

                        # For "nearby objects", store block text.
                        if extract_images and identify_nearby_objects:
                            block_text.append(text)

                        # If we only want text at SPAN level, flush after each run.
                        if text_depth == TextTypeEnum.SPAN:
                            text_extraction = _construct_text_metadata(
                                presentation,
                                shape,
                                accumulated_text,
                                keywords,
                                slide_idx,
                                shape_idx,
                                paragraph_idx,
                                run_idx,
                                slide_count,
                                text_depth,
                                source_metadata,
                                base_unified_metadata,
                            )
                            if len(text_extraction) > 0:
                                extracted_data.append(text_extraction)
                            accumulated_text = []

                    # Add newlines for separation at line/paragraph level.
                    if accumulated_text and not accumulated_text[-1].endswith("\n\n"):
                        accumulated_text.append("\n\n")

                    # Flush per paragraph when LINE granularity was requested.
                    if text_depth == TextTypeEnum.LINE:
                        text_extraction = _construct_text_metadata(
                            presentation,
                            shape,
                            accumulated_text,
                            keywords,
                            slide_idx,
                            shape_idx,
                            paragraph_idx,
                            -1,
                            slide_count,
                            text_depth,
                            source_metadata,
                            base_unified_metadata,
                        )
                        if len(text_extraction) > 0:
                            extracted_data.append(text_extraction)
                        accumulated_text = []

                # Flush per shape when BLOCK granularity was requested.
                if text_depth == TextTypeEnum.BLOCK:
                    text_extraction = _construct_text_metadata(
                        presentation,
                        shape,
                        accumulated_text,
                        keywords,
                        slide_idx,
                        shape_idx,
                        -1,
                        -1,
                        slide_count,
                        text_depth,
                        source_metadata,
                        base_unified_metadata,
                    )
                    if len(text_extraction) > 0:
                        extracted_data.append(text_extraction)
                    accumulated_text = []

                # Record this shape's text for the slide's nearby-objects map.
                if extract_images and identify_nearby_objects and block_text:
                    page_nearby_blocks["text"]["content"].append("".join(block_text))
                    page_nearby_blocks["text"]["bbox"].append(get_bbox(shape_object=shape))

            # ---------------------------------------------
            # 2) Image Handling (DEFERRED) with nested/group shapes
            # ---------------------------------------------
            if extract_images:
                process_shape(
                    shape,
                    shape_idx,
                    slide_idx,
                    slide_count,
                    pending_images,
                    page_nearby_blocks,
                    source_metadata,
                    base_unified_metadata,
                )

            # ---------------------------------------------
            # 3) Table Handling
            # ---------------------------------------------
            if extract_tables and shape.has_table:
                table_extraction = _construct_table_metadata(
                    shape, slide_idx, slide_count, source_metadata, base_unified_metadata
                )
                extracted_data.append(table_extraction)

        # Flush per slide when PAGE granularity was requested.
        if extract_text and (text_depth == TextTypeEnum.PAGE) and (len(accumulated_text) > 0):
            text_extraction = _construct_text_metadata(
                presentation,
                shape,  # may pass None if preferred
                accumulated_text,
                keywords,
                slide_idx,
                -1,
                -1,
                -1,
                slide_count,
                text_depth,
                source_metadata,
                base_unified_metadata,
            )
            if len(text_extraction) > 0:
                extracted_data.append(text_extraction)
            accumulated_text = []

    # Flush once for the whole deck when DOCUMENT granularity was requested.
    if extract_text and (text_depth == TextTypeEnum.DOCUMENT) and (len(accumulated_text) > 0):
        text_extraction = _construct_text_metadata(
            presentation,
            shape,  # may pass None
            accumulated_text,
            keywords,
            -1,
            -1,
            -1,
            -1,
            slide_count,
            text_depth,
            source_metadata,
            base_unified_metadata,
        )
        if len(text_extraction) > 0:
            extracted_data.append(text_extraction)
        accumulated_text = []

    # ---------------------------------------------
    # FINAL STEP: Finalize images (and tables/charts)
    # ---------------------------------------------
    if extract_images or extract_tables or extract_charts:
        _finalize_images(
            pending_images,
            extracted_data,
            pptx_extractor_config,
            extract_tables=extract_tables,
            extract_charts=extract_charts,
            trace_info=trace_info,
        )

    return extracted_data
486
|
-
|
|
487
|
-
|
|
488
|
-
def _construct_text_metadata(
    presentation_object,
    shape_object,
    accumulated_text,
    keywords,
    slide_idx,
    shape_idx,
    paragraph_idx,
    run_idx,
    slide_count,
    text_depth,
    source_metadata,
    base_unified_metadata,
):
    """
    Assemble a validated text record for a span/line/block/page of PPTX text.

    Joins ``accumulated_text``, detects its language, computes the bounding
    box for the given granularity, and merges everything on top of
    ``base_unified_metadata``.

    Returns a three-element list: [ContentTypeEnum.TEXT, metadata dict, UUID string].
    """
    joined_text = "".join(accumulated_text)

    content_metadata = {
        "type": ContentTypeEnum.TEXT,
        "description": ContentDescriptionEnum.PPTX_TEXT,
        "page_number": slide_idx,
        # Position within the deck; -1 marks levels finer than the granularity.
        "hierarchy": {
            "page_count": slide_count,
            "page": slide_idx,
            "block": shape_idx,
            "line": paragraph_idx,
            "span": run_idx,
        },
    }

    text_metadata = {
        "text_type": text_depth,
        "summary": "",
        "keywords": keywords,
        "language": detect_language(joined_text),
        "text_location": get_bbox(
            presentation_object=presentation_object,
            shape_object=shape_object,
            text_depth=text_depth,
        ),
    }

    # Layer the extraction results over the caller-provided base metadata.
    merged_metadata = base_unified_metadata.copy()
    merged_metadata["content"] = joined_text
    merged_metadata["source_metadata"] = source_metadata
    merged_metadata["content_metadata"] = content_metadata
    merged_metadata["text_metadata"] = text_metadata

    validated = validate_metadata(merged_metadata)

    return [ContentTypeEnum.TEXT, validated.model_dump(), str(uuid.uuid4())]
|
546
|
-
|
|
547
|
-
|
|
548
|
-
# need to add block text to hierarchy/nearby_objects, including bbox
|
|
549
|
-
def _construct_image_metadata(
    shape_idx: int,
    slide_idx: int,
    slide_count: int,
    page_nearby_blocks: Dict,
    base64_img: str,
    source_metadata: Dict,
    base_unified_metadata: Dict,
):
    """
    Build standard PPTX image metadata.

    Returns a ``[type, metadata_dict, uuid]`` triple for the extracted
    base64-encoded image, where ``metadata_dict`` is the validated
    unified-metadata dump.
    """
    # Example bounding box
    bbox = (0, 0, 0, 0)  # or extract from shape.left, shape.top, shape.width, shape.height if desired

    content_metadata = {
        "type": ContentTypeEnum.IMAGE,
        "description": ContentDescriptionEnum.PPTX_IMAGE,
        "page_number": slide_idx,
        "hierarchy": {
            "page_count": slide_count,
            "page": slide_idx,
            "block": shape_idx,
            "line": -1,
            "span": -1,
            "nearby_objects": page_nearby_blocks,
        },
    }

    image_metadata = {
        "image_type": DocumentTypeEnum.PNG,
        "structured_image_type": ContentTypeEnum.UNKNOWN,
        "caption": "",  # could attempt to guess a caption from nearby text
        "text": "",
        "image_location": bbox,
    }

    # Tolerate a falsy base: start from an empty dict in that case.
    merged_metadata = dict(base_unified_metadata) if base_unified_metadata else {}
    merged_metadata.update(
        {
            "content": base64_img,
            "source_metadata": source_metadata,
            "content_metadata": content_metadata,
            "image_metadata": image_metadata,
        }
    )

    validated = validate_metadata(merged_metadata)

    return [
        ContentTypeEnum.IMAGE.value,
        validated.model_dump(),
        str(uuid.uuid4()),
    ]
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
def _construct_table_metadata(
    shape,
    slide_idx: int,
    slide_count: int,
    source_metadata: Dict,
    base_unified_metadata: Dict,
):
    """
    Build unified STRUCTURED/table metadata for a PPTX table shape.

    Reads the table cells (first row as header) into a DataFrame, renders it
    as markdown for ``table_content``, and returns a
    ``[ContentTypeEnum.STRUCTURED, metadata_dict, uuid]`` triple.
    """
    rows = [[cell.text for cell in row.cells] for row in shape.table.rows]
    df = pd.DataFrame(rows[1:], columns=rows[0])
    # As df is eventually converted to markdown, collapse any newlines, tabs,
    # or extra spaces in the column names so headers stay on one line.
    df.columns = df.columns.str.replace(r"\s+", " ", regex=True)

    content_metadata = {
        "type": ContentTypeEnum.STRUCTURED,
        "description": ContentDescriptionEnum.PPTX_TABLE,
        "page_number": slide_idx,
        "hierarchy": {
            "page_count": slide_count,
            "page": slide_idx,
            "line": -1,
            "span": -1,
        },
        "subtype": ContentTypeEnum.TABLE,
    }

    table_metadata = {
        "caption": "",
        "table_format": TableFormatEnum.MARKDOWN,
        "table_location": get_bbox(shape_object=shape),
        "table_content": df.to_markdown(index=False),
    }

    merged_metadata = {
        **base_unified_metadata,
        "content": "",
        "source_metadata": source_metadata,
        "content_metadata": content_metadata,
        "table_metadata": table_metadata,
    }

    validated = validate_metadata(merged_metadata)

    return [ContentTypeEnum.STRUCTURED, validated.model_dump(), str(uuid.uuid4())]
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
def get_bbox(
    presentation_object: Optional[Presentation] = None,
    shape_object: Optional[Slide] = None,
    text_depth: Optional[TextTypeEnum] = None,
):
    """
    Return a ``(top, left, bottom, right)`` bounding box for the given scope.

    DOCUMENT depth has no meaningful box and yields ``(-1, -1, -1, -1)``;
    PAGE depth covers the whole slide; otherwise the shape's own geometry is
    used when a shape is supplied.
    """
    if text_depth == TextTypeEnum.DOCUMENT:
        return (-1, -1, -1, -1)

    if text_depth == TextTypeEnum.PAGE:
        # Whole-slide box anchored at the origin.
        return (0, 0, presentation_object.slide_height, presentation_object.slide_width)

    if shape_object:
        top, left = shape_object.top, shape_object.left
        return (top, left, top + shape_object.height, left + shape_object.width)

    # No scope information available.
    return (-1, -1, -1, -1)
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
def ungroup_shapes(shapes):
    """Recursively flatten grouped shapes into a single flat list, in order."""
    flat = []
    for item in shapes:
        if item.shape_type == MSO_SHAPE_TYPE.GROUP:
            # Expand a group in place so document order is preserved.
            flat.extend(ungroup_shapes(item.shapes))
        else:
            flat.append(item)
    return flat
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
def is_title(shape):
    """Return True when the shape is a title-style placeholder."""
    title_types = (
        PP_PLACEHOLDER.TITLE,
        PP_PLACEHOLDER.VERTICAL_TITLE,
        PP_PLACEHOLDER.CENTER_TITLE,
    )
    return bool(shape.is_placeholder and shape.placeholder_format.type in title_types)
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
def process_title(shape):
    """Render the shape's title text as a setext-style (``===``) heading."""
    heading = shape.text_frame.text.strip()
    underline = "=" * len(heading)
    return f"{heading}\n{underline}"
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
def is_subtitle(shape):
    """Return True when the shape is a subtitle placeholder."""
    return bool(shape.is_placeholder and shape.placeholder_format.type == PP_PLACEHOLDER.SUBTITLE)
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
def process_subtitle(shape):
    """Render the shape's subtitle text as a setext-style (``---``) heading."""
    heading = shape.text_frame.text.strip()
    underline = "-" * len(heading)
    return f"{heading}\n{underline}"
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
def is_list_block(shape):
    """
    Return True when a text shape should be treated as a (nested) list.

    A shape is a list block when any paragraph is indented beyond the top
    level (``level != 0``). Seeing more than one distinct level necessarily
    includes a non-zero level, so a single ``any()`` over the paragraphs is
    equivalent to tracking the set of levels seen.
    """
    return any(paragraph.level != 0 for paragraph in shape.text_frame.paragraphs)
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
def escape_text(text):
    """
    Backslash-escape markdown special characters and HTML-like tags in *text*.

    Two passes: the first escapes individual markdown metacharacters
    (``\\ * ` ! _ { } [ ] ( ) # + - .``); the second prefixes whole
    ``<...>`` tags with a backslash.
    """

    def escape_repl(match_obj):
        return "\\" + match_obj.group(0)

    # NOTE: the hyphen must be escaped inside the class; the previous pattern
    # `\+-\.` formed the character range '+'..'.' and escaped commas too.
    escape_regex_1 = re.compile(r"([\\\*`!_\{\}\[\]\(\)#\+\-\.])")
    escape_regex_2 = re.compile(r"(<[^>]+>)")
    text = re.sub(escape_regex_1, escape_repl, text)
    text = re.sub(escape_regex_2, escape_repl, text)

    return text
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
def get_hyperlink(text, url):
    """Return *text* rendered as a markdown hyperlink pointing at *url*."""
    return f"[{text}]({url})"
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
def is_accent(font):
    """Return True when a run's font reads as accented: italic, or colored
    with one of the theme's ACCENT_1..ACCENT_6 scheme colors."""
    if font.italic:
        return True
    # Only scheme-typed colors expose a theme_color; check the type first.
    if font.color.type != MSO_COLOR_TYPE.SCHEME:
        return False
    return font.color.theme_color in (
        MSO_THEME_COLOR.ACCENT_1,
        MSO_THEME_COLOR.ACCENT_2,
        MSO_THEME_COLOR.ACCENT_3,
        MSO_THEME_COLOR.ACCENT_4,
        MSO_THEME_COLOR.ACCENT_5,
        MSO_THEME_COLOR.ACCENT_6,
    )
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
def is_underlined(font):
    """Return True when the font carries underline formatting."""
    return bool(font.underline)
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
def format_text(text: str, bold: bool = False, italic: bool = False, underline: bool = False) -> str:
    """
    Wrap *text* in markdown/HTML emphasis markers.

    Leading and trailing whitespace is kept outside the markers so the
    styling applies only to the visible content. Whitespace-only input is
    returned unchanged.
    """
    if not text.strip():
        return text

    # Split into (leading-ws, core, trailing-ws); DOTALL lets core span newlines.
    match = re.match(r"(^\s*)(.*?)(\s*$)", text, re.DOTALL)
    leading, core, trailing = match.groups() if match else ("", text, "")

    if bold:
        core = f"**{core}**"
    if italic:
        core = f"*{core}*"
    if underline:
        core = f"<u>{core}</u>"

    return leading + core + trailing
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
def is_strong(font):
    """Return True when a run's font reads as strong: bold, or colored with
    one of the theme's DARK_1/DARK_2 scheme colors."""
    if font.bold:
        return True
    # Only scheme-typed colors expose a theme_color; check the type first.
    if font.color.type != MSO_COLOR_TYPE.SCHEME:
        return False
    return font.color.theme_color in (MSO_THEME_COLOR.DARK_1, MSO_THEME_COLOR.DARK_2)
|