nv-ingest-api 26.1.0rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nv-ingest-api might be problematic. Click here for more details.
- nv_ingest_api/__init__.py +3 -0
- nv_ingest_api/interface/__init__.py +218 -0
- nv_ingest_api/interface/extract.py +977 -0
- nv_ingest_api/interface/mutate.py +154 -0
- nv_ingest_api/interface/store.py +200 -0
- nv_ingest_api/interface/transform.py +382 -0
- nv_ingest_api/interface/utility.py +186 -0
- nv_ingest_api/internal/__init__.py +0 -0
- nv_ingest_api/internal/enums/__init__.py +3 -0
- nv_ingest_api/internal/enums/common.py +550 -0
- nv_ingest_api/internal/extract/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/__init__.py +3 -0
- nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
- nv_ingest_api/internal/extract/docx/__init__.py +5 -0
- nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
- nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
- nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
- nv_ingest_api/internal/extract/html/__init__.py +3 -0
- nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
- nv_ingest_api/internal/extract/image/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
- nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
- nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
- nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
- nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
- nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
- nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
- nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
- nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
- nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
- nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
- nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
- nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
- nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
- nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
- nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
- nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
- nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
- nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
- nv_ingest_api/internal/meta/__init__.py +3 -0
- nv_ingest_api/internal/meta/udf.py +232 -0
- nv_ingest_api/internal/mutate/__init__.py +3 -0
- nv_ingest_api/internal/mutate/deduplicate.py +110 -0
- nv_ingest_api/internal/mutate/filter.py +133 -0
- nv_ingest_api/internal/primitives/__init__.py +0 -0
- nv_ingest_api/internal/primitives/control_message_task.py +16 -0
- nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
- nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
- nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
- nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
- nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
- nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
- nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
- nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
- nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
- nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
- nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
- nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
- nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
- nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
- nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
- nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
- nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
- nv_ingest_api/internal/schemas/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
- nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
- nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
- nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
- nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
- nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
- nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
- nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
- nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
- nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
- nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
- nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
- nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
- nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
- nv_ingest_api/internal/schemas/meta/udf.py +23 -0
- nv_ingest_api/internal/schemas/mixins.py +39 -0
- nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
- nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
- nv_ingest_api/internal/schemas/store/__init__.py +3 -0
- nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
- nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
- nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
- nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
- nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
- nv_ingest_api/internal/store/__init__.py +3 -0
- nv_ingest_api/internal/store/embed_text_upload.py +236 -0
- nv_ingest_api/internal/store/image_upload.py +251 -0
- nv_ingest_api/internal/transform/__init__.py +3 -0
- nv_ingest_api/internal/transform/caption_image.py +219 -0
- nv_ingest_api/internal/transform/embed_text.py +702 -0
- nv_ingest_api/internal/transform/split_text.py +182 -0
- nv_ingest_api/util/__init__.py +3 -0
- nv_ingest_api/util/control_message/__init__.py +0 -0
- nv_ingest_api/util/control_message/validators.py +47 -0
- nv_ingest_api/util/converters/__init__.py +0 -0
- nv_ingest_api/util/converters/bytetools.py +78 -0
- nv_ingest_api/util/converters/containers.py +65 -0
- nv_ingest_api/util/converters/datetools.py +90 -0
- nv_ingest_api/util/converters/dftools.py +127 -0
- nv_ingest_api/util/converters/formats.py +64 -0
- nv_ingest_api/util/converters/type_mappings.py +27 -0
- nv_ingest_api/util/dataloader/__init__.py +9 -0
- nv_ingest_api/util/dataloader/dataloader.py +409 -0
- nv_ingest_api/util/detectors/__init__.py +5 -0
- nv_ingest_api/util/detectors/language.py +38 -0
- nv_ingest_api/util/exception_handlers/__init__.py +0 -0
- nv_ingest_api/util/exception_handlers/converters.py +72 -0
- nv_ingest_api/util/exception_handlers/decorators.py +429 -0
- nv_ingest_api/util/exception_handlers/detectors.py +74 -0
- nv_ingest_api/util/exception_handlers/pdf.py +116 -0
- nv_ingest_api/util/exception_handlers/schemas.py +68 -0
- nv_ingest_api/util/image_processing/__init__.py +5 -0
- nv_ingest_api/util/image_processing/clustering.py +260 -0
- nv_ingest_api/util/image_processing/processing.py +177 -0
- nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
- nv_ingest_api/util/image_processing/transforms.py +850 -0
- nv_ingest_api/util/imports/__init__.py +3 -0
- nv_ingest_api/util/imports/callable_signatures.py +108 -0
- nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
- nv_ingest_api/util/introspection/__init__.py +3 -0
- nv_ingest_api/util/introspection/class_inspect.py +145 -0
- nv_ingest_api/util/introspection/function_inspect.py +65 -0
- nv_ingest_api/util/logging/__init__.py +0 -0
- nv_ingest_api/util/logging/configuration.py +102 -0
- nv_ingest_api/util/logging/sanitize.py +84 -0
- nv_ingest_api/util/message_brokers/__init__.py +3 -0
- nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
- nv_ingest_api/util/metadata/__init__.py +5 -0
- nv_ingest_api/util/metadata/aggregators.py +516 -0
- nv_ingest_api/util/multi_processing/__init__.py +8 -0
- nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
- nv_ingest_api/util/nim/__init__.py +161 -0
- nv_ingest_api/util/pdf/__init__.py +3 -0
- nv_ingest_api/util/pdf/pdfium.py +428 -0
- nv_ingest_api/util/schema/__init__.py +3 -0
- nv_ingest_api/util/schema/schema_validator.py +10 -0
- nv_ingest_api/util/service_clients/__init__.py +3 -0
- nv_ingest_api/util/service_clients/client_base.py +86 -0
- nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
- nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
- nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
- nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
- nv_ingest_api/util/string_processing/__init__.py +51 -0
- nv_ingest_api/util/string_processing/configuration.py +682 -0
- nv_ingest_api/util/string_processing/yaml.py +109 -0
- nv_ingest_api/util/system/__init__.py +0 -0
- nv_ingest_api/util/system/hardware_info.py +594 -0
- nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
- nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
- nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
- nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
- nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
- udfs/__init__.py +5 -0
- udfs/llm_summarizer_udf.py +259 -0
|
@@ -0,0 +1,504 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import logging
|
|
7
|
+
import re
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from sklearn.cluster import DBSCAN
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def process_yolox_graphic_elements(yolox_text_dict):
    """
    Flatten the per-class texts of a yolox-graphic-elements result into one string.

    Parameters
    ----------
    yolox_text_dict : dict
        Mapping from graphic-element class name (e.g. ``"chart_title"``) to the text
        collected for that class. Classes that are absent contribute an empty string.

    Returns
    -------
    str
        The class texts joined with single spaces in a fixed order, with leading and
        trailing whitespace removed. Separators of absent classes are kept, so
        interior runs of several spaces are possible.
    """
    # Fixed output order of the classes; it defines how the chart text reads.
    ordered_keys = (
        "chart_title",
        "caption",
        "x_title",
        "xlabel",
        "y_title",
        "ylabel",
        "legend_label",
        "legend_title",
        "mark_label",
        "value_label",
        "other",
    )

    pieces = [yolox_text_dict.get(key, "") for key in ordered_keys]
    return " ".join(pieces).strip()
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def match_bboxes(yolox_box, ocr_boxes, already_matched=None, delta=2.0):
    """
    Associates a yolox-graphic-elements box to OCR bboxes, by taking overlapping boxes.
    Criterion is iou > max_iou / delta where max_iou is the biggest found overlap.
    Boxes are expected in format (x0, y0, x1, y1)
    Args:
        yolox_box (np array [4]): Reference box.
        ocr_boxes (np array [n x 4]): OCR boxes.
        already_matched (list or None, Optional): Already matched ids to ignore.
        delta (float, Optional): IoU delta for considering several boxes. Defaults to 2.0.
    Returns:
        np array or list: Indices of the matched bboxes. An empty list is returned when
            there are no OCR boxes or when the best IoU does not exceed 0.01.
    """
    ocr_boxes = np.asarray(ocr_boxes)
    if ocr_boxes.size == 0:
        # np.max below would raise on an empty array; nothing can match anyway.
        return []

    x0_1, y0_1, x1_1, y1_1 = yolox_box
    x0_2, y0_2, x1_2, y1_2 = (
        ocr_boxes[:, 0],
        ocr_boxes[:, 1],
        ocr_boxes[:, 2],
        ocr_boxes[:, 3],
    )

    # Intersection
    inter_y0 = np.maximum(y0_1, y0_2)
    inter_y1 = np.minimum(y1_1, y1_2)
    inter_x0 = np.maximum(x0_1, x0_2)
    inter_x1 = np.minimum(x1_1, x1_2)
    inter_area = (np.maximum(0, inter_y1 - inter_y0) * np.maximum(0, inter_x1 - inter_x0)).astype(float)

    # Union
    area_1 = (y1_1 - y0_1) * (x1_1 - x0_1)
    area_2 = (y1_2 - y0_2) * (x1_2 - x0_2)
    union_area = area_1 + area_2 - inter_area

    # IoU - pairs with an empty union (both boxes have zero area) get an IoU of 0
    # instead of a NaN produced by a division by zero.
    ious = np.divide(inter_area, union_area, out=np.zeros_like(inter_area), where=union_area > 0)

    max_iou = np.max(ious)
    if max_iou <= 0.01:
        return []

    matches = np.where(ious > (max_iou / delta))[0]
    if already_matched is not None:
        # Vectorised filter; keeps the integer dtype even when nothing is left.
        matches = matches[~np.isin(matches, list(already_matched))]
    return matches
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def join_yolox_graphic_elements_and_ocr_output(yolox_output, ocr_boxes, ocr_txts):
    """
    Matching boxes
    We need to associate a text to the graphic-element detections.
    For each kept class and for each detection of that class, we look for overlapping
    OCR text bboxes with IoU > max_iou / delta where max_iou is the biggest found overlap.
    Found texts are added to the class representation, and removed from the texts to match,
    so that one OCR text is attached to at most one detection.

    Args:
        yolox_output (dict): Maps a class name to its detected boxes. Only the first four
            values of each box are used; a trailing score, if any, is dropped.
        ocr_boxes: OCR boxes, indexed as [n, point, (x, y)].
        ocr_txts: OCR texts, one per OCR box.

    Returns:
        dict: Maps each kept class that has at least one detection to its joined text.
            An empty dict is returned when there are no OCR boxes or no OCR texts.
    """
    KEPT_CLASSES = [  # Classes that are kept, in matching priority order
        "chart_title",
        "x_title",
        "y_title",
        "xlabel",
        "ylabel",
        "other",
        "legend_label",
        "legend_title",
        "mark_label",
        "value_label",
    ]

    ocr_txts = np.array(ocr_txts)
    ocr_boxes = np.array(ocr_boxes)

    if (ocr_txts.size == 0) or (ocr_boxes.size == 0):
        return {}

    # Reduce each OCR polygon to its axis-aligned bounding box (x0, y0, x1, y1).
    ocr_boxes = np.array(
        [
            ocr_boxes[:, :, 0].min(-1),
            ocr_boxes[:, :, 1].min(-1),
            ocr_boxes[:, :, 0].max(-1),
            ocr_boxes[:, :, 1].max(-1),
        ]
    ).T

    already_matched = []
    results = {}

    for k in KEPT_CLASSES:
        if not len(yolox_output.get(k, [])):  # No bounding boxes
            continue

        texts = []
        for yolox_box in yolox_output[k]:
            # if there's a score at the end, drop the score.
            yolox_box = yolox_box[:4]
            ocr_ids = match_bboxes(yolox_box, ocr_boxes, already_matched=already_matched, delta=4)

            if len(ocr_ids) > 0:
                ocr_ids = [int(i) for i in ocr_ids]
                texts.append(" ".join(ocr_txts[ocr_ids].tolist()))
                # Remember the consumed OCR boxes so that later detections skip them.
                already_matched.extend(ocr_ids)

        processed_texts = []
        for t in texts:
            t = re.sub(r"\s+", " ", t)  # collapse whitespace runs
            t = re.sub(r"\.+", ".", t)  # collapse runs of dots
            processed_texts.append(t)

        if "title" in k:
            processed_texts = " ".join(processed_texts)
        else:
            processed_texts = " - ".join(processed_texts)

        results[k] = processed_texts

    return results
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def convert_ocr_response_to_psuedo_markdown(bboxes, texts):
    """
    Lay OCR texts out as pipe-delimited lines, one line per cluster of similar ``y0``.

    Args:
        bboxes: OCR boxes, each holding eight values once flattened
            (presumably four (x, y) corner points - TODO confirm the corner order).
            Lists and NumPy arrays are both accepted.
        texts: OCR texts, one per box.

    Returns:
        str: One ``"| a | b |"`` line per cluster, each terminated by a newline.
            An empty string is returned when there are no boxes or no texts.
    """
    # len() rather than truthiness: the truth value of a NumPy array is ambiguous.
    if bboxes is None or texts is None or len(bboxes) == 0 or len(texts) == 0:
        return ""

    bboxes = np.array(bboxes).astype(int)
    # Flattened positions 0, 1, 2 and -1 of each box are used as x0, y0, x1 and y1.
    bboxes = bboxes.reshape(-1, 8)[:, [0, 1, 2, -1]]

    preds_df = pd.DataFrame(
        {"x0": bboxes[:, 0], "y0": bboxes[:, 1], "x1": bboxes[:, 2], "y1": bboxes[:, 3], "text": texts}
    )
    preds_df = preds_df.sort_values("y0")

    # Group boxes whose y0 values chain together within 10 units into the same line.
    dbscan = DBSCAN(eps=10, min_samples=1)
    dbscan.fit(preds_df["y0"].values[:, None])

    preds_df["cluster"] = dbscan.labels_
    preds_df = preds_df.sort_values(["cluster", "x0"])

    lines = ["| " + " | ".join(dfg["text"].values.tolist()) + " |\n" for _, dfg in preds_df.groupby("cluster")]
    return "".join(lines)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def join_yolox_table_structure_and_ocr_output(yolox_cell_preds, ocr_boxes, ocr_txts):
    """
    Combine table-structure detections and OCR output into a single text.

    Every OCR box is assigned to a detected cell, then to row(s) and column(s). Texts that
    received both a row and a column are rendered together as one markdown table; the
    remaining texts are kept as plain lines. Everything is finally ordered top to bottom
    and left to right.

    Args:
        yolox_cell_preds (dict): Detected boxes under the keys "cell", "row" and "column".
        ocr_boxes: OCR boxes, indexed as [n, point, (x, y)]. Lists and NumPy arrays are
            both accepted.
        ocr_txts: OCR texts, one per OCR box.

    Returns:
        str: The newline-joined text lines and markdown table. An empty string is returned
            when there are no OCR boxes or no OCR texts.
    """
    # len() rather than truthiness: the truth value of a NumPy array is ambiguous.
    if ocr_boxes is None or ocr_txts is None or len(ocr_boxes) == 0 or len(ocr_txts) == 0:
        return ""

    ocr_boxes = np.array(ocr_boxes)
    # Reduce each OCR polygon to its axis-aligned bounding box (x0, y0, x1, y1).
    ocr_boxes_ = np.array(
        [
            ocr_boxes[:, :, 0].min(-1),
            ocr_boxes[:, :, 1].min(-1),
            ocr_boxes[:, :, 0].max(-1),
            ocr_boxes[:, :, 1].max(-1),
        ]
    ).T

    assignments = []
    for i, (b, t) in enumerate(zip(ocr_boxes_, ocr_txts)):
        # Find a cell - fall back to the OCR box itself when no cell matches
        matches_cell = assign_boxes(b, yolox_cell_preds["cell"], delta=1)
        cell = yolox_cell_preds["cell"][matches_cell[0]] if len(matches_cell) else b

        # Find a row
        matches_row = assign_boxes(cell, yolox_cell_preds["row"], delta=1)
        row_ids = matches_row if len(matches_row) else -1

        # Find a column - or more if it is the first row
        if isinstance(row_ids, np.ndarray):
            delta = 2 if row_ids.min() == 0 else 1  # delta=2 if header column
        else:
            delta = 1
        matches_col = assign_boxes(cell, yolox_cell_preds["column"], delta=delta)
        col_ids = matches_col if len(matches_col) else -1

        assignments.append(
            {
                "index": i,
                "ocr_box": b,
                # -1 marks "no match"; only an array of ids counts as a table position
                "is_table": isinstance(col_ids, np.ndarray) and isinstance(row_ids, np.ndarray),
                "cell_id": matches_cell[0] if len(matches_cell) else -1,
                "cell": cell,
                "col_ids": col_ids,
                "row_ids": row_ids,
                "text": t,
            }
        )
    df_assign = pd.DataFrame(assignments)

    # Merge cells with several assigned texts
    dfs = []
    for cell_id, df_cell in df_assign.groupby("cell_id"):
        if len(df_cell) > 1 and cell_id > -1:
            df_cell = merge_text_in_cell(df_cell)
        dfs.append(df_cell)
    df_assign = pd.concat(dfs)

    df_text = df_assign[~df_assign["is_table"]].reset_index(drop=True)

    # Table to text
    df_table = df_assign[df_assign["is_table"]].reset_index(drop=True)
    if len(df_table):
        mat = build_markdown(df_table)
        markdown_table = display_markdown(mat, use_header=False)

        # The table is positioned by the box enclosing all of its OCR boxes
        all_boxes = np.stack(df_table.ocr_box.values)
        table_box = np.concatenate([all_boxes[:, [0, 1]].min(0), all_boxes[:, [2, 3]].max(0)])

        df_table_to_text = pd.DataFrame(
            [
                {
                    "ocr_box": table_box,
                    "text": markdown_table,
                    "is_table": True,
                }
            ]
        )
        # Final text representations dataframe
        df_text = pd.concat([df_text, df_table_to_text], ignore_index=True)

    df_text = df_text.rename(columns={"ocr_box": "box"})

    # Sort by y and x, using box centers quantized into bins of 20 (y) and 10 (x)
    df_text["x"] = df_text["box"].apply(lambda x: (x[0] + x[2]) / 2)
    df_text["y"] = df_text["box"].apply(lambda x: (x[1] + x[3]) / 2)
    df_text["x"] = (df_text["x"] - df_text["x"].min()) // 10
    df_text["y"] = (df_text["y"] - df_text["y"].min()) // 20
    df_text = df_text.sort_values(["y", "x"], ignore_index=True)

    # Loop over lines
    rows_list = []
    for r, df_row in df_text.groupby("y"):
        if df_row["is_table"].values.any():  # Add table
            table = df_row[df_row["is_table"]]
            df_row = df_row[~df_row["is_table"]]
        else:
            table = None

        if len(df_row) > 1:  # Add text - several texts on the same line are merged
            df_row = df_row.reset_index(drop=True)
            df_row["text"] = "\n".join(df_row["text"].values.tolist())

        rows_list.append(df_row.head(1))

        if table is not None:
            rows_list.append(table)

    df_display = pd.concat(rows_list, ignore_index=True)
    result = "\n".join(df_display.text.values.tolist())

    return result
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def assign_boxes(ocr_box, boxes, delta=2.0, min_overlap=0.25):
    """
    Pick the candidate boxes that cover a reference `ocr_box` best.

    The score of a candidate is its intersection area with `ocr_box` divided by the area
    of `ocr_box` (not a symmetric IoU).

    Args:
        ocr_box (list or numpy.ndarray): Reference bounding box [x_min, y_min, x_max, y_max].
        boxes (numpy.ndarray): Array of candidate bounding boxes with shape (N, 4).
        delta (float, optional): A candidate is kept when its score is at least the best
            score divided by `delta`. Defaults to 2.0.
        min_overlap (float, optional): The best score must exceed this value for anything
            to be returned. Defaults to 0.25.

    Returns:
        numpy.ndarray or list: Indices of the kept candidates sorted by decreasing score.
            An empty list is returned when there are no candidates or no match.
    """
    if len(boxes) == 0:
        return []

    candidates = np.array(boxes)
    ref_x0, ref_y0, ref_x1, ref_y1 = ocr_box

    # Intersection rectangle of the reference box with every candidate
    top = np.maximum(ref_y0, candidates[:, 1])
    bottom = np.minimum(ref_y1, candidates[:, 3])
    left = np.maximum(ref_x0, candidates[:, 0])
    right = np.minimum(ref_x1, candidates[:, 2])
    overlap_area = np.maximum(0, bottom - top) * np.maximum(0, right - left)

    # Normalize by the reference box size; the epsilon guards a zero-area reference
    ref_area = (ref_y1 - ref_y0) * (ref_x1 - ref_x0)
    coverage = overlap_area / (ref_area + 1e-6)

    best = np.max(coverage)
    if best <= min_overlap:  # No match
        return []

    n_keep = int(np.count_nonzero(coverage >= (best / delta)))
    return np.argsort(-coverage)[:n_keep]
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def build_markdown(df):
    """
    Arrange the texts of a dataframe into a grid of table cells.

    Args:
        df (pandas DataFrame): One line per text, with the columns "row_ids", "col_ids"
            (the grid positions the text belongs to) and "text".

    Returns:
        list[list]: The grid as a list of rows, without its empty rows and columns.
            A text assigned to several rows or columns is repeated in each position.
    """
    df = df.reset_index(drop=True)
    row_col, col_col, text_col = df["row_ids"], df["col_ids"], df["text"]

    last_col = max(np.max(ids) for ids in col_col.values)
    last_row = max(np.max(ids) for ids in row_col.values)

    # Grid of empty strings, large enough for the highest row and column ids
    mat = np.empty((last_row + 1, last_col + 1), dtype=str).tolist()

    for i in df.index:
        # A plain int (the -1 placeholder) means the text has no grid position
        if isinstance(row_col[i], int) or isinstance(col_col[i], int):
            continue
        for r in row_col[i]:
            for c in col_col[i]:
                mat[r][c] = (mat[r][c] + " " + text_col[i]).strip()

    # Remove empty rows, then empty columns by filtering the transposed grid
    mat = remove_empty_row(mat)
    transposed = np.array(mat).T.tolist()
    mat = np.array(remove_empty_row(transposed)).T.tolist()

    return mat
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def merge_text_in_cell(df_cell):
    """
    Merges text from multiple rows into a single cell and recalculates its bounding box.
    Values are sorted by rounded (y, x) coordinates.

    The input dataframe is left untouched; the result is a new dataframe.

    Args:
        df_cell (pandas.DataFrame): DataFrame containing cells to merge. It must provide
            the columns "ocr_box", "cell" and "text".

    Returns:
        pandas.DataFrame: A one-row DataFrame (the first row in reading order) whose text
            is the space-joined text of all rows and whose "ocr_box" is set to its "cell".
    """
    ocr_boxes = np.stack(df_cell["ocr_box"].values)

    # Reading order: top-left corners relative to the group's minimum, in bins of 10.
    # assign() builds a new frame, so no helper column leaks into the caller's data.
    ordered = df_cell.assign(
        x=(ocr_boxes[:, 0] - ocr_boxes[:, 0].min()) // 10,
        y=(ocr_boxes[:, 1] - ocr_boxes[:, 1].min()) // 10,
    ).sort_values(["y", "x"])

    # Explicit copy so the assignments below never write into a slice.
    merged = ordered.head(1).drop(columns=["x", "y"]).copy()
    merged["text"] = " ".join(ordered["text"].values.tolist())
    merged["ocr_box"] = merged["cell"]

    return merged
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
def remove_empty_row(mat):
    """
    Remove empty rows from a matrix.

    A row is empty when all of its cells have length zero. A row without any cell is
    treated as empty as well.

    Args:
        mat (list[list]): The matrix to remove empty rows from.

    Returns:
        list[list]: The matrix with empty rows removed.
    """
    # any() is False for a row with no cells, where max() would raise a ValueError.
    return [row for row in mat if any(len(c) for c in row)]
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def display_markdown(
    data: list[list[str]],
    use_header: bool = False,
) -> str:
    """
    Render rows of strings as a pipe-delimited markdown table.

    Parameters:
        data (list[list[str]]): The table rows. Shorter rows are padded with empty cells
            up to the length of the longest row.
        use_header (bool, optional): Whether to render the first row as a header followed
            by a ``---`` separator line. Defaults to False.

    Returns:
        str: A markdown-formatted table as a string, or "EMPTY TABLE" when `data` is empty.
    """
    if len(data) == 0:
        return "EMPTY TABLE"

    width = max(len(row) for row in data)
    padded = [row + [""] * (width - len(row)) for row in data]

    def _render(cells):
        # One table line: cells separated and enclosed by pipes
        return "| " + " | ".join(cells) + " |"

    if not use_header:
        return "\n".join(_render(row) for row in padded)

    header = _render(padded[0])
    separator = _render(["---"] * width)
    body = "\n".join(_render(row) for row in padded[1:])
    if body:
        return f"{header}\n{separator}\n{body}"
    return f"{header}\n{separator}"
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
def reorder_boxes(boxes, texts, confs, mode="top_left", dbscan_eps=10):
    """
    Put OCR results in reading order (top to bottom, then left to right).

    With mode "center" the position of a box is the midpoint of its first and third points.
    With mode "top_left" the position of a box is its first point.
    With a non-zero `dbscan_eps`, lines are found by clustering the y positions with
    DBSCAN. Otherwise, or when the clustering raises a ValueError, y positions are
    quantized into bins of 5 instead.

    Args:
        boxes (np array [n x 4 x 2]): The bounding boxes of the OCR results.
        texts (np array [n]): The text of the OCR results.
        confs (np array [n]): The confidence scores of the OCR results.
        mode (str, optional): The mode to reorder the boxes. Defaults to "top_left".
            NOTE(review): no position is computed for any other value, so the sorting
            that follows cannot find its columns.
        dbscan_eps (float, optional): The epsilon parameter for DBSCAN. Defaults to 10.

    Returns:
        List[List[int, ...]]: The reordered bounding boxes.
        List[str]: The reordered texts.
        List[float]: The reordered confidence scores.
    """
    records = [[box, text, conf] for box, text, conf in zip(boxes, texts, confs)]
    df = pd.DataFrame(records, columns=["bbox", "text", "conf"])

    if mode == "center":
        df["x"] = df["bbox"].apply(lambda pts: (pts[0][0] + pts[2][0]) / 2)
        df["y"] = df["bbox"].apply(lambda pts: (pts[0][1] + pts[2][1]) / 2)
    elif mode == "top_left":
        df["x"] = df["bbox"].apply(lambda pts: (pts[0][0]))
        df["y"] = df["bbox"].apply(lambda pts: (pts[0][1]))

    # Without an epsilon there is no clustering at all: go straight to the naive sort
    use_naive_sort = not dbscan_eps
    if dbscan_eps:
        try:
            clusterer = DBSCAN(eps=dbscan_eps, min_samples=1)
            clusterer.fit(df["y"].values[:, None])
            df["cluster"] = clusterer.labels_
            # Lines are ordered by the (truncated) mean y of their cluster
            df["cluster_centers"] = df.groupby("cluster")["y"].transform("mean").astype(int)
            df = df.sort_values(["cluster_centers", "x"], ascending=[True, True], ignore_index=True)
        except ValueError:
            use_naive_sort = True

    if use_naive_sort:
        df["y"] = np.round((df["y"] - df["y"].min()) // 5, 0)
        df = df.sort_values(["y", "x"], ascending=[True, True], ignore_index=True)

    return df["bbox"].values.tolist(), df["text"].values.tolist(), df["conf"].values.tolist()
|