nv-ingest-api 2025.4.18.dev20250418__py3-none-any.whl → 2025.4.19.dev20250419__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/__init__.py +0 -3
- nv_ingest_api/{internal/primitives → primitives}/control_message_task.py +0 -4
- nv_ingest_api/{internal/primitives → primitives}/ingest_control_message.py +2 -5
- {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/METADATA +1 -1
- nv_ingest_api-2025.4.19.dev20250419.dist-info/RECORD +9 -0
- {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/WHEEL +1 -1
- nv_ingest_api/interface/__init__.py +0 -215
- nv_ingest_api/interface/extract.py +0 -972
- nv_ingest_api/interface/mutate.py +0 -154
- nv_ingest_api/interface/store.py +0 -218
- nv_ingest_api/interface/transform.py +0 -382
- nv_ingest_api/interface/utility.py +0 -200
- nv_ingest_api/internal/enums/__init__.py +0 -3
- nv_ingest_api/internal/enums/common.py +0 -494
- nv_ingest_api/internal/extract/__init__.py +0 -3
- nv_ingest_api/internal/extract/audio/__init__.py +0 -3
- nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -149
- nv_ingest_api/internal/extract/docx/__init__.py +0 -5
- nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -205
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -3
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -122
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -895
- nv_ingest_api/internal/extract/image/__init__.py +0 -3
- nv_ingest_api/internal/extract/image/chart_extractor.py +0 -353
- nv_ingest_api/internal/extract/image/image_extractor.py +0 -204
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -3
- nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -403
- nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -253
- nv_ingest_api/internal/extract/image/table_extractor.py +0 -344
- nv_ingest_api/internal/extract/pdf/__init__.py +0 -3
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -19
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -484
- nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -243
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -597
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -146
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -603
- nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -96
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -426
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -74
- nv_ingest_api/internal/extract/pptx/__init__.py +0 -5
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -799
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -187
- nv_ingest_api/internal/mutate/__init__.py +0 -3
- nv_ingest_api/internal/mutate/deduplicate.py +0 -110
- nv_ingest_api/internal/mutate/filter.py +0 -133
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/internal/primitives/nim/__init__.py +0 -8
- nv_ingest_api/internal/primitives/nim/default_values.py +0 -15
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -3
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -274
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -56
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -270
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -275
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -238
- nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -462
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -367
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -132
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -152
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -1400
- nv_ingest_api/internal/primitives/nim/nim_client.py +0 -344
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -81
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +0 -69
- nv_ingest_api/internal/primitives/tracing/logging.py +0 -96
- nv_ingest_api/internal/primitives/tracing/tagging.py +0 -197
- nv_ingest_api/internal/schemas/__init__.py +0 -3
- nv_ingest_api/internal/schemas/extract/__init__.py +0 -3
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -130
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -135
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -124
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -124
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -128
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -218
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -124
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -129
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -3
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -23
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -34
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -19
- nv_ingest_api/internal/schemas/meta/__init__.py +0 -3
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -11
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -237
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -221
- nv_ingest_api/internal/schemas/mutate/__init__.py +0 -3
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -16
- nv_ingest_api/internal/schemas/store/__init__.py +0 -3
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -28
- nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -30
- nv_ingest_api/internal/schemas/transform/__init__.py +0 -3
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -15
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -17
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -25
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -22
- nv_ingest_api/internal/store/__init__.py +0 -3
- nv_ingest_api/internal/store/embed_text_upload.py +0 -236
- nv_ingest_api/internal/store/image_upload.py +0 -232
- nv_ingest_api/internal/transform/__init__.py +0 -3
- nv_ingest_api/internal/transform/caption_image.py +0 -205
- nv_ingest_api/internal/transform/embed_text.py +0 -496
- nv_ingest_api/internal/transform/split_text.py +0 -157
- nv_ingest_api/util/__init__.py +0 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +0 -47
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +0 -78
- nv_ingest_api/util/converters/containers.py +0 -65
- nv_ingest_api/util/converters/datetools.py +0 -90
- nv_ingest_api/util/converters/dftools.py +0 -127
- nv_ingest_api/util/converters/formats.py +0 -64
- nv_ingest_api/util/converters/type_mappings.py +0 -27
- nv_ingest_api/util/detectors/__init__.py +0 -5
- nv_ingest_api/util/detectors/language.py +0 -38
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +0 -72
- nv_ingest_api/util/exception_handlers/decorators.py +0 -223
- nv_ingest_api/util/exception_handlers/detectors.py +0 -74
- nv_ingest_api/util/exception_handlers/pdf.py +0 -116
- nv_ingest_api/util/exception_handlers/schemas.py +0 -68
- nv_ingest_api/util/image_processing/__init__.py +0 -5
- nv_ingest_api/util/image_processing/clustering.py +0 -260
- nv_ingest_api/util/image_processing/processing.py +0 -179
- nv_ingest_api/util/image_processing/table_and_chart.py +0 -449
- nv_ingest_api/util/image_processing/transforms.py +0 -407
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +0 -31
- nv_ingest_api/util/message_brokers/__init__.py +0 -3
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -9
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -465
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -71
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -451
- nv_ingest_api/util/metadata/__init__.py +0 -5
- nv_ingest_api/util/metadata/aggregators.py +0 -469
- nv_ingest_api/util/multi_processing/__init__.py +0 -8
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -194
- nv_ingest_api/util/nim/__init__.py +0 -56
- nv_ingest_api/util/pdf/__init__.py +0 -3
- nv_ingest_api/util/pdf/pdfium.py +0 -427
- nv_ingest_api/util/schema/__init__.py +0 -0
- nv_ingest_api/util/schema/schema_validator.py +0 -10
- nv_ingest_api/util/service_clients/__init__.py +0 -3
- nv_ingest_api/util/service_clients/client_base.py +0 -86
- nv_ingest_api/util/service_clients/kafka/__init__.py +0 -3
- nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +0 -823
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +0 -531
- nv_ingest_api/util/string_processing/__init__.py +0 -51
- nv_ingest_api-2025.4.18.dev20250418.dist-info/RECORD +0 -152
- /nv_ingest_api/{internal → primitives}/__init__.py +0 -0
- {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/top_level.txt +0 -0
|
@@ -1,449 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
-
# All rights reserved.
|
|
3
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
import logging
|
|
7
|
-
import re
|
|
8
|
-
|
|
9
|
-
import numpy as np
|
|
10
|
-
import pandas as pd
|
|
11
|
-
from sklearn.cluster import DBSCAN
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
logger = logging.getLogger(__name__)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def process_yolox_graphic_elements(yolox_text_dict):
    """
    Flatten the text fields of a yolox-graphic-elements result into one string.

    Parameters
    ----------
    yolox_text_dict : dict
        Mapping from graphic-element class name (for example ``"chart_title"``
        or ``"xlabel"``) to the text collected for that class. Keys that are
        absent are treated as empty strings.

    Returns
    -------
    str
        The field texts in a fixed order, joined by single spaces, with leading
        and trailing whitespace stripped. An absent field still contributes its
        separator, so runs of spaces can appear in the middle of the result.
    """
    # The order below defines the order of the text in the output.
    field_order = (
        "chart_title",
        "caption",
        "x_title",
        "xlabel",
        "y_title",
        "ylabel",
        "legend_label",
        "legend_title",
        "mark_label",
        "value_label",
        "other",
    )
    pieces = [yolox_text_dict.get(field, "") for field in field_order]
    return " ".join(pieces).strip()
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
def match_bboxes(yolox_box, paddle_ocr_boxes, already_matched=None, delta=2.0):
    """
    Associate a yolox-graphic-elements box with the PaddleOCR boxes that overlap it.

    A PaddleOCR box is selected when its IoU with ``yolox_box`` is strictly
    greater than ``max_iou / delta``, where ``max_iou`` is the largest IoU found
    among all candidates. Boxes are expected in format (x0, y0, x1, y1).

    Args:
        yolox_box (np array [4]): Reference box.
        paddle_ocr_boxes (np array [n x 4]): PaddleOCR boxes. May be empty.
        already_matched (list or None, Optional): Indices to exclude from the result.
        delta (float, Optional): Divisor applied to the best IoU to obtain the
            acceptance threshold. Defaults to 2.0.

    Returns:
        np array or list: Indices of the matched boxes. An empty list is returned
            when there are no candidate boxes or when the best IoU is <= 0.01.
    """
    paddle_ocr_boxes = np.asarray(paddle_ocr_boxes)
    if paddle_ocr_boxes.size == 0:
        # np.max on an empty array raises; no candidates simply means no match.
        return []

    x0_1, y0_1, x1_1, y1_1 = yolox_box
    x0_2 = paddle_ocr_boxes[:, 0]
    y0_2 = paddle_ocr_boxes[:, 1]
    x1_2 = paddle_ocr_boxes[:, 2]
    y1_2 = paddle_ocr_boxes[:, 3]

    # Intersection
    inter_h = np.maximum(0, np.minimum(y1_1, y1_2) - np.maximum(y0_1, y0_2))
    inter_w = np.maximum(0, np.minimum(x1_1, x1_2) - np.maximum(x0_1, x0_2))
    inter_area = inter_h * inter_w

    # Union
    area_1 = (y1_1 - y0_1) * (x1_1 - x0_1)
    area_2 = (y1_2 - y0_2) * (x1_2 - x0_2)
    union_area = area_1 + area_2 - inter_area

    # IoU. Degenerate pairs (zero union) get an IoU of 0 instead of producing
    # a divide-by-zero warning and NaN values that would poison the maximum.
    ious = np.zeros(np.shape(union_area), dtype=float)
    np.divide(inter_area, union_area, out=ious, where=union_area > 0)

    max_iou = np.max(ious)
    if max_iou <= 0.01:
        return []

    matches = np.where(ious > (max_iou / delta))[0]
    if already_matched is not None:
        excluded = set(already_matched)
        # Keep the integer dtype even when every match is filtered out, so the
        # result can always be used for indexing.
        matches = np.array([m for m in matches if m not in excluded], dtype=matches.dtype)
    return matches
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
def join_yolox_graphic_elements_and_paddle_output(yolox_output, paddle_boxes, paddle_txts):
    """
    Attach PaddleOCR texts to yolox-graphic-elements detections, class by class.

    For every kept class and every detection of that class, the OCR boxes that
    overlap the detection are looked up with ``match_bboxes`` (``delta=4``) and
    their texts are joined with spaces. The texts gathered for one class are then
    whitespace-normalised, runs of dots are collapsed to one dot, and the pieces
    are joined with ``" "`` for classes whose name contains ``"title"`` and with
    ``" - "`` otherwise.

    Args:
        yolox_output (dict): Maps class name to a sequence of boxes. Only the first
            four values of each box are used (a trailing score is dropped).
        paddle_boxes: OCR boxes, indexed as ``[box, point, coordinate]``.
            Presumably four (x, y) corner points per box - confirm against caller.
        paddle_txts: OCR texts, one per OCR box.

    Returns:
        dict: Class name -> joined text, for every kept class that has at least
            one detection. Empty dict when there are no OCR texts or boxes.
    """
    # Classes that are looked up in `yolox_output`; any other key is ignored.
    kept_classes = (
        "chart_title",
        "x_title",
        "y_title",
        "xlabel",
        "ylabel",
        "other",
        "legend_label",
        "legend_title",
        "mark_label",
        "value_label",
    )

    ocr_texts = np.array(paddle_txts)
    ocr_quads = np.array(paddle_boxes)
    if ocr_texts.size == 0 or ocr_quads.size == 0:
        return {}

    # Collapse each OCR polygon to its axis-aligned (x0, y0, x1, y1) extent.
    xs = ocr_quads[:, :, 0]
    ys = ocr_quads[:, :, 1]
    ocr_rects = np.array([xs.min(-1), ys.min(-1), xs.max(-1), ys.max(-1)]).T

    # NOTE(review): this list is handed to match_bboxes but nothing is ever added
    # to it here, so an OCR box can be matched to several detections. Confirm
    # whether that is intended before changing it.
    already_matched = []
    results = {}

    for class_name in kept_classes:
        detections = yolox_output.get(class_name, [])
        if len(detections) == 0:  # No bounding boxes
            continue

        class_texts = []
        for detection in detections:
            # Keep the four coordinates only; a score may follow them.
            ocr_ids = match_bboxes(detection[:4], ocr_rects, already_matched=already_matched, delta=4)
            if len(ocr_ids) > 0:
                class_texts.append(" ".join(ocr_texts[ocr_ids].tolist()))

        cleaned = [re.sub(r"\.+", ".", re.sub(r"\s+", " ", text)) for text in class_texts]
        separator = " " if "title" in class_name else " - "
        results[class_name] = separator.join(cleaned)

    return results
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
def convert_paddle_response_to_psuedo_markdown(bboxes, texts):
    """
    Lay OCR texts out as pipe-delimited lines, one line per cluster of similar ``y0``.

    Each box is flattened to eight integers, of which the values at positions
    0, 1, 2 and 7 are kept as ``x0``, ``y0``, ``x1`` and ``y1``. The ``y0`` values
    are clustered with DBSCAN (``eps=10``, ``min_samples=1``); within a cluster
    the texts are ordered by ``x0``.

    Args:
        bboxes: OCR boxes, eight numbers per box once flattened. Presumably four
            (x, y) corner points - confirm against caller.
        texts: OCR texts, one per box.

    Returns:
        str: One ``"| a | b |\\n"`` line per cluster, concatenated. Empty string
            when ``bboxes`` or ``texts`` is empty.
    """
    if not (bboxes and texts):
        return ""

    corners = np.array(bboxes).astype(int).reshape(-1, 8)[:, [0, 1, 2, -1]]

    frame = pd.DataFrame(
        {
            "x0": corners[:, 0],
            "y0": corners[:, 1],
            "x1": corners[:, 2],
            "y1": corners[:, 3],
            "text": texts,
        }
    ).sort_values("y0")

    # Cluster on the single y0 feature; DBSCAN expects a 2-D (n_samples, 1) array.
    clusterer = DBSCAN(eps=10, min_samples=1)
    clusterer.fit(frame["y0"].values[:, None])

    frame["cluster"] = clusterer.labels_
    frame = frame.sort_values(["cluster", "x0"])

    lines = [
        "| " + " | ".join(group["text"].values.tolist()) + " |\n"
        for _, group in frame.groupby("cluster")
    ]
    return "".join(lines)
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_boxes, paddle_ocr_txts):
    """
    Combine table-structure detections with OCR output into a text rendering.

    Every OCR box is assigned to a detected cell (falling back to the OCR box
    itself), then to rows and columns. OCR texts that received both a row and a
    column are rendered together as one markdown table; the remaining texts stay
    as free text. All pieces are finally ordered by coarse (y, x) position and
    joined with newlines.

    Args:
        yolox_cell_preds (dict): Must provide the keys ``"cell"``, ``"row"`` and
            ``"column"``, each a sequence of boxes passed to ``assign_boxes``.
        paddle_ocr_boxes: OCR boxes, indexed as ``[box, point, coordinate]``.
            Presumably four (x, y) corner points per box - confirm against caller.
        paddle_ocr_txts: OCR texts, one per OCR box.

    Returns:
        str: The newline-joined text. Empty string when there are no OCR boxes
            or no OCR texts.
    """
    if not (paddle_ocr_boxes and paddle_ocr_txts):
        return ""

    # Collapse each OCR polygon to its axis-aligned (x0, y0, x1, y1) extent.
    ocr_quads = np.array(paddle_ocr_boxes)
    xs = ocr_quads[:, :, 0]
    ys = ocr_quads[:, :, 1]
    ocr_rects = np.array([xs.min(-1), ys.min(-1), xs.max(-1), ys.max(-1)]).T

    assignments = []
    for idx, (ocr_rect, ocr_text) in enumerate(zip(ocr_rects, paddle_ocr_txts)):
        # Find a cell; without one, the OCR box stands in for the cell.
        cell_matches = assign_boxes(ocr_rect, yolox_cell_preds["cell"], delta=1)
        has_cell = len(cell_matches) > 0
        cell = yolox_cell_preds["cell"][cell_matches[0]] if has_cell else ocr_rect

        # Find a row; -1 marks "no row".
        row_matches = assign_boxes(cell, yolox_cell_preds["row"], delta=1)
        row_ids = row_matches if len(row_matches) else -1

        # Find a column - a looser threshold (delta=2) is used when row 0 is
        # among the matched rows, so that such a cell may span several columns.
        if isinstance(row_ids, np.ndarray) and row_ids.min() == 0:
            col_delta = 2
        else:
            col_delta = 1
        col_matches = assign_boxes(cell, yolox_cell_preds["column"], delta=col_delta)
        col_ids = col_matches if len(col_matches) else -1

        assignments.append(
            {
                "index": idx,
                "paddle_box": ocr_rect,
                "is_table": isinstance(col_ids, np.ndarray) and isinstance(row_ids, np.ndarray),
                "cell_id": cell_matches[0] if has_cell else -1,
                "cell": cell,
                "col_ids": col_ids,
                "row_ids": row_ids,
                "text": ocr_text,
            }
        )
    df_assign = pd.DataFrame(assignments)

    # Merge cells with several assigned texts (cell_id -1 means "no cell").
    per_cell = [
        merge_text_in_cell(group) if (len(group) > 1 and cell_id > -1) else group
        for cell_id, group in df_assign.groupby("cell_id")
    ]
    df_assign = pd.concat(per_cell)

    df_text = df_assign[~df_assign["is_table"]].reset_index(drop=True)
    df_table = df_assign[df_assign["is_table"]].reset_index(drop=True)

    # Table to text: the whole table becomes a single entry whose box is the
    # extent of all OCR boxes that belong to it.
    if len(df_table):
        markdown_table = display_markdown(build_markdown(df_table), use_header=False)

        all_boxes = np.stack(df_table.paddle_box.values)
        table_box = np.concatenate([all_boxes[:, [0, 1]].min(0), all_boxes[:, [2, 3]].max(0)])

        df_table_to_text = pd.DataFrame(
            [
                {
                    "paddle_box": table_box,
                    "text": markdown_table,
                    "is_table": True,
                }
            ]
        )
        # Final text representations dataframe
        df_text = pd.concat([df_text, df_table_to_text], ignore_index=True)

    df_text = df_text.rename(columns={"paddle_box": "box"})

    # Sort by y and x: box centres, shifted to start at 0 and bucketed
    # (10 units wide in x, 20 units high in y).
    df_text["x"] = df_text["box"].apply(lambda box: (box[0] + box[2]) / 2)
    df_text["y"] = df_text["box"].apply(lambda box: (box[1] + box[3]) / 2)
    df_text["x"] = (df_text["x"] - df_text["x"].min()) // 10
    df_text["y"] = (df_text["y"] - df_text["y"].min()) // 20
    df_text = df_text.sort_values(["y", "x"], ignore_index=True)

    # Loop over lines: in each y bucket the free text comes first, then the table.
    rows_list = []
    for _, df_row in df_text.groupby("y"):
        table = None
        if df_row["is_table"].values.any():  # Add table
            table = df_row[df_row["is_table"]]
            df_row = df_row[~df_row["is_table"]]

        if len(df_row) > 1:  # Add text
            df_row = df_row.reset_index(drop=True)
            df_row["text"] = "\n".join(df_row["text"].values.tolist())

        rows_list.append(df_row.head(1))

        if table is not None:
            rows_list.append(table)

    df_display = pd.concat(rows_list, ignore_index=True)
    return "\n".join(df_display.text.values.tolist())
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
def assign_boxes(paddle_box, boxes, delta=2.0, min_overlap=0.25):
    """
    Pick the candidate boxes that best cover a reference `paddle_box`.

    The score of a candidate is its intersection area with `paddle_box` divided
    by the area of `paddle_box` (plus 1e-6 to avoid dividing by zero); it is not
    a symmetric IoU.

    Args:
        paddle_box (list or numpy.ndarray): Reference bounding box [x_min, y_min, x_max, y_max].
        boxes (numpy.ndarray): Array of candidate bounding boxes with shape (N, 4).
        delta (float, optional): A candidate is kept when its score is at least
            ``best_score / delta``. Defaults to 2.0.
        min_overlap (float, optional): The best score must exceed this value for
            anything to be returned. Defaults to 0.25.

    Returns:
        numpy.ndarray or list: Indices of the kept candidates, ordered by
            decreasing score. An empty list if `boxes` is empty or the best
            score does not exceed `min_overlap`.
    """
    if len(boxes) == 0:
        return []

    candidates = np.array(boxes)
    ref_x0, ref_y0, ref_x1, ref_y1 = paddle_box

    # Intersection extents, clamped at 0 for candidates that do not overlap.
    overlap_h = np.maximum(0, np.minimum(ref_y1, candidates[:, 3]) - np.maximum(ref_y0, candidates[:, 1]))
    overlap_w = np.maximum(0, np.minimum(ref_x1, candidates[:, 2]) - np.maximum(ref_x0, candidates[:, 0]))

    # Normalize by paddle_box size
    ref_area = (ref_y1 - ref_y0) * (ref_x1 - ref_x0)
    scores = (overlap_h * overlap_w) / (ref_area + 1e-6)

    best = np.max(scores)
    if best <= min_overlap:  # No match
        return []

    n_kept = int(np.sum(scores >= (best / delta)))
    return np.argsort(-scores)[:n_kept]
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
def build_markdown(df):
    """
    Arrange the texts of a dataframe into a 2-D grid of table cells.

    Each record's text is written to every (row, column) pair given by its
    ``row_ids`` and ``col_ids``; texts landing in the same cell are joined with a
    space. Rows and columns that end up completely empty are removed.

    Args:
        df (pandas DataFrame): Needs the columns ``row_ids``, ``col_ids`` and
            ``text``. ``row_ids`` / ``col_ids`` hold iterables of integer
            indices; a record where either is a plain integer is skipped.

    Returns:
        list[list]: A list of lists representing the markdown table. Empty list
            when there is nothing to place.
    """
    df = df.reset_index(drop=True)
    if len(df) == 0:
        return []

    n_cols = max(np.max(c) for c in df["col_ids"].values)
    n_rows = max(np.max(r) for r in df["row_ids"].values)

    # Build the grid from explicit empty strings: np.empty(..., dtype=str) hands
    # back uninitialized memory, so its cells are not guaranteed to start as "".
    mat = [[""] * (int(n_cols) + 1) for _ in range(int(n_rows) + 1)]

    for row_ids, col_ids, text in zip(df["row_ids"], df["col_ids"], df["text"]):
        # A scalar id (e.g. -1) means the record has no row or no column.
        if isinstance(row_ids, (int, np.integer)) or isinstance(col_ids, (int, np.integer)):
            continue
        for r in row_ids:
            for c in col_ids:
                mat[r][c] = (mat[r][c] + " " + text).strip()

    # Remove empty rows & columns (columns are filtered on the transposed grid).
    mat = [row for row in mat if any(len(cell) for cell in row)]
    kept_columns = [col for col in zip(*mat) if any(len(cell) for cell in col)]
    return [list(row) for row in zip(*kept_columns)]
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
def merge_text_in_cell(df_cell):
    """
    Merges text from multiple rows into a single cell and recalculates its bounding box.

    Rows are ordered by the top-left corner of their ``paddle_box``, bucketed in
    steps of 10 units relative to the smallest value (y first, then x). Their
    texts are joined with spaces in that order.

    The input dataframe is left untouched; all work happens on a copy.

    Args:
        df_cell (pandas.DataFrame): DataFrame containing cells to merge. Needs
            the columns ``paddle_box``, ``cell`` and ``text``.

    Returns:
        pandas.DataFrame: A one-row dataframe (the first row in sorted order)
            whose ``text`` is the merged text and whose ``paddle_box`` is
            replaced by that row's ``cell`` value.
    """
    paddle_boxes = np.stack(df_cell["paddle_box"].values)

    # assign() returns a new frame, so the helper sort keys never leak into the
    # caller's dataframe.
    ordered = df_cell.assign(
        x=(paddle_boxes[:, 0] - paddle_boxes[:, 0].min()) // 10,
        y=(paddle_boxes[:, 1] - paddle_boxes[:, 1].min()) // 10,
    ).sort_values(["y", "x"])

    # Copy before writing so the assignments below target an independent frame
    # rather than a slice of `ordered`.
    merged = ordered.head(1).copy()
    merged["text"] = " ".join(ordered["text"].values.tolist())
    merged["paddle_box"] = merged["cell"]

    return merged.drop(columns=["x", "y"])
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
def remove_empty_row(mat):
    """
    Remove empty rows from a matrix.

    A row is empty when every one of its cells has length 0. A row without any
    cells is also treated as empty.

    Args:
        mat (list[list]): The matrix to remove empty rows from.

    Returns:
        list[list]: A new list holding the non-empty rows, in their original order.
    """
    # any() is False for a row with no cells, where max() over the lengths
    # would raise ValueError.
    return [row for row in mat if any(len(cell) for cell in row)]
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
def display_markdown(
    data: list[list[str]],
    use_header: bool = False,
) -> str:
    """
    Render a list of rows as a pipe-delimited markdown table.

    Rows shorter than the widest row are padded with empty cells.

    Parameters:
        data (list[list[str]]): The table data, one sublist per row.
        use_header (bool, optional): When True, the first row is rendered as the
            header and followed by a ``---`` separator row. Defaults to False.

    Returns:
        str: A markdown-formatted table as a string, or ``"EMPTY TABLE"`` when
            `data` has no rows.
    """
    if len(data) == 0:
        return "EMPTY TABLE"

    width = max(len(row) for row in data)
    padded = [row + [""] * (width - len(row)) for row in data]

    def render(cells):
        # One markdown line: cells separated by " | " and wrapped in pipes.
        return "| " + " | ".join(cells) + " |"

    if not use_header:
        return "\n".join(render(row) for row in padded)

    header = render(padded[0])
    separator = render(["---"] * width)
    body = "\n".join(render(row) for row in padded[1:])
    if body:
        return f"{header}\n{separator}\n{body}"
    return f"{header}\n{separator}"
|