nv-ingest-api 2025.7.15.dev20250715__py3-none-any.whl → 2025.7.17.dev20250717__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This release has been flagged as potentially problematic.
- nv_ingest_api/interface/extract.py +18 -18
- nv_ingest_api/internal/enums/common.py +6 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +75 -55
- nv_ingest_api/internal/extract/image/infographic_extractor.py +59 -35
- nv_ingest_api/internal/extract/image/table_extractor.py +81 -63
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +7 -7
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +32 -20
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +32 -9
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +58 -0
- nv_ingest_api/internal/primitives/nim/model_interface/{paddle.py → ocr.py} +132 -39
- nv_ingest_api/internal/primitives/nim/nim_client.py +46 -11
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +6 -6
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +6 -6
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +5 -5
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +5 -0
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +4 -0
- nv_ingest_api/internal/transform/embed_text.py +103 -12
- nv_ingest_api/internal/transform/split_text.py +13 -8
- nv_ingest_api/util/image_processing/table_and_chart.py +97 -42
- nv_ingest_api/util/image_processing/transforms.py +19 -5
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +1 -1
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +51 -48
- nv_ingest_api/util/metadata/aggregators.py +4 -1
- {nv_ingest_api-2025.7.15.dev20250715.dist-info → nv_ingest_api-2025.7.17.dev20250717.dist-info}/METADATA +1 -1
- {nv_ingest_api-2025.7.15.dev20250715.dist-info → nv_ingest_api-2025.7.17.dev20250717.dist-info}/RECORD +28 -28
- {nv_ingest_api-2025.7.15.dev20250715.dist-info → nv_ingest_api-2025.7.17.dev20250717.dist-info}/WHEEL +0 -0
- {nv_ingest_api-2025.7.15.dev20250715.dist-info → nv_ingest_api-2025.7.17.dev20250717.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-2025.7.15.dev20250715.dist-info → nv_ingest_api-2025.7.17.dev20250717.dist-info}/top_level.txt +0 -0
nv_ingest_api/internal/transform/embed_text.py

@@ -4,6 +4,7 @@
 
 import logging
 from concurrent.futures import ThreadPoolExecutor
+from functools import partial
 from typing import Any, Dict, Tuple, Optional, Iterable, List
 
 import pandas as pd
@@ -19,6 +20,9 @@ from nv_ingest_api.util.schema.schema_validator import validate_schema
 logger = logging.getLogger(__name__)
 
 
+MULTI_MODAL_MODELS = ["llama-3.2-nemoretriever-1b-vlm-embed-v1"]
+
+
 # ------------------------------------------------------------------------------
 # Asynchronous Embedding Requests
 # ------------------------------------------------------------------------------
@@ -33,6 +37,7 @@ def _make_async_request(
     input_type: str,
     truncate: str,
     filter_errors: bool,
+    modalities: Optional[List[str]] = None,
 ) -> list:
     """
     Interacts directly with the NIM embedding service to calculate embeddings for a batch of prompts.
@@ -74,11 +79,18 @@ def _make_async_request(
             base_url=embedding_nim_endpoint,
         )
 
+        extra_body = {
+            "input_type": input_type,
+            "truncate": truncate,
+        }
+        if modalities:
+            extra_body["modality"] = modalities
+
         resp = client.embeddings.create(
             input=prompts,
             model=embedding_model,
             encoding_format=encoding_format,
-            extra_body=
+            extra_body=extra_body,
         )
 
         response["embedding"] = resp.data
@@ -110,6 +122,7 @@ def _async_request_handler(
     input_type: str,
     truncate: str,
     filter_errors: bool,
+    modalities: Optional[List[str]] = None,
 ) -> List[dict]:
     """
     Gathers calculated embedding results from the NIM embedding service concurrently.
@@ -138,6 +151,9 @@ def _async_request_handler(
     List[dict]
         A list of response dictionaries from the embedding service.
     """
+    if modalities is None:
+        modalities = [None] * len(prompts)
+
     with ThreadPoolExecutor() as executor:
         futures = [
             executor.submit(
@@ -150,8 +166,9 @@ def _async_request_handler(
                 input_type=input_type,
                 truncate=truncate,
                 filter_errors=filter_errors,
+                modalities=modality_batch,
             )
-            for prompt_batch in prompts
+            for prompt_batch, modality_batch in zip(prompts, modalities)
         ]
         results = [future.result() for future in futures]
 
@@ -167,6 +184,7 @@ def _async_runner(
     input_type: str,
     truncate: str,
     filter_errors: bool,
+    modalities: Optional[List[str]] = None,
 ) -> dict:
     """
     Concurrently launches all NIM embedding requests and flattens the results.
@@ -204,6 +222,7 @@ def _async_runner(
         input_type,
         truncate,
         filter_errors,
+        modalities=modalities,
     )
 
     flat_results = {"embeddings": [], "info_msgs": []}
@@ -263,7 +282,19 @@ def _add_embeddings(row, embeddings, info_msgs):
     return row
 
 
-def _get_pandas_text_content(row):
+def _format_image_input_string(image_b64: Optional[str]) -> str:
+    if not image_b64:
+        return
+    return f"data:image/png;base64,{image_b64}"
+
+
+def _format_text_image_pair_input_string(text: Optional[str], image_b64: Optional[str]) -> str:
+    if (not text) or (not text.strip()) or (not image_b64):
+        return
+    return f"{text.strip()} {_format_image_input_string(image_b64)}"
+
+
+def _get_pandas_text_content(row, modality="text"):
     """
     Extracts text content from a DataFrame row.
 
@@ -280,7 +311,7 @@ def _get_pandas_text_content(row):
     return row["content"]
 
 
-def _get_pandas_table_content(row):
+def _get_pandas_table_content(row, modality="text"):
     """
     Extracts table/chart content from a DataFrame row.
 
@@ -294,10 +325,19 @@ def _get_pandas_table_content(row):
     str
         The table/chart content from the row.
     """
-
+    if modality == "text":
+        content = row.get("table_metadata", {}).get("table_content")
+    elif modality == "image":
+        content = _format_image_input_string(row.get("content"))
+    elif modality == "text_image":
+        text = row.get("table_metadata", {}).get("table_content")
+        image = row.get("content")
+        content = _format_text_image_pair_input_string(text, image)
+
+    return content
 
 
-def _get_pandas_image_content(row):
+def _get_pandas_image_content(row, modality="text"):
     """
     Extracts image caption content from a DataFrame row.
 
@@ -311,10 +351,28 @@ def _get_pandas_image_content(row):
     str
         The image caption from the row.
     """
-
+    subtype = row.get("content_metadata", {}).get("subtype")
+    if modality == "text":
+        if subtype == "page_image":
+            content = row.get("image_metadata", {}).get("text")
+        else:
+            content = row.get("image_metadata", {}).get("caption")
+    elif modality == "image":
+        content = _format_image_input_string(row.get("content"))
+    elif modality == "text_image":
+        if subtype == "page_image":
+            text = row.get("image_metadata", {}).get("text")
+        else:
+            text = row.get("image_metadata", {}).get("caption")
+        image = row.get("content")
+        content = _format_text_image_pair_input_string(text, image)
 
+    # A workaround to save memory.
+    row["content"] = ""
+    return content
 
-def _get_pandas_audio_content(row):
+
+def _get_pandas_audio_content(row, modality="text"):
     """
     A pandas UDF used to select extracted audio transcription to be used to create embeddings.
     """
@@ -408,6 +466,23 @@ def _concatenate_extractions_pandas(
 # ------------------------------------------------------------------------------
 
 
+def does_model_support_multimodal_embeddings(model: str) -> bool:
+    """
+    Checks if a given model supports multi-modal embeddings.
+
+    Parameters
+    ----------
+    model : str
+        The name of the model.
+
+    Returns
+    -------
+    bool
+        True if the model supports multi-modal embeddings, False otherwise.
+    """
+    return model in MULTI_MODAL_MODELS
+
+
 def transform_create_text_embeddings_internal(
     df_transform_ledger: pd.DataFrame,
     task_config: Dict[str, Any],
@@ -460,6 +535,15 @@ def transform_create_text_embeddings_internal(
         ContentTypeEnum.AUDIO: _get_pandas_audio_content,
         ContentTypeEnum.VIDEO: lambda x: None,  # Not supported yet.
     }
+    task_type_to_modality = {
+        ContentTypeEnum.TEXT: task_config.get("text_elements_modality") or transform_config.text_elements_modality,
+        ContentTypeEnum.STRUCTURED: (
+            task_config.get("structured_elements_modality") or transform_config.structured_elements_modality
+        ),
+        ContentTypeEnum.IMAGE: task_config.get("image_elements_modality") or transform_config.image_elements_modality,
+        ContentTypeEnum.AUDIO: task_config.get("audio_elements_modality") or transform_config.audio_elements_modality,
+        ContentTypeEnum.VIDEO: lambda x: None,  # Not supported yet.
+    }
 
     def _content_type_getter(row):
         return row["content_metadata"]["type"]
@@ -480,7 +564,7 @@ def transform_create_text_embeddings_internal(
         # Extract content and normalize empty or non-str to None
         extracted_content = (
             df_content["metadata"]
-            .apply(content_getter)
+            .apply(partial(content_getter, modality=task_type_to_modality[content_type]))
            .apply(lambda x: x.strip() if isinstance(x, str) and x.strip() else None)
         )
         df_content["_content"] = extracted_content
@@ -488,9 +572,15 @@ def transform_create_text_embeddings_internal(
         # Prepare batches for only valid (non-None) content
         valid_content_mask = df_content["_content"].notna()
         if valid_content_mask.any():
-
-
-
+            filtered_content_list = df_content.loc[valid_content_mask, "_content"].tolist()
+            filtered_content_batches = _generate_batches(filtered_content_list, batch_size=transform_config.batch_size)
+
+            if model_name in MULTI_MODAL_MODELS:
+                modality_list = [task_type_to_modality[content_type]] * len(filtered_content_list)
+                modality_batches = _generate_batches(modality_list, batch_size=transform_config.batch_size)
+            else:
+                modality_batches = None
+
             content_embeddings = _async_runner(
                 filtered_content_batches,
                 api_key,
@@ -500,6 +590,7 @@ def transform_create_text_embeddings_internal(
                 transform_config.input_type,
                 transform_config.truncate,
                 False,
+                modalities=modality_batches,
             )
             # Build a simple row index -> embedding map
             embeddings_dict = dict(
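The embedding hunks above thread an optional per-batch modality list from the task/transform config down to the NIM request, where it is forwarded through the OpenAI-compatible client's `extra_body`. Below is a minimal sketch of the resulting request shape; the endpoint URL, API key, `input_type`/`truncate` values, and base64 payload are placeholders, not values taken from this package:

```python
from openai import OpenAI

# Placeholder endpoint and credentials; point these at your own NIM deployment.
client = OpenAI(api_key="<api-key>", base_url="http://localhost:8000/v1")

# One text element and one image element, mirroring what _get_pandas_*_content produces.
inputs = [
    "Quarterly revenue by region",
    "data:image/png;base64,iVBORw0KGgo...",  # truncated base64 payload, placeholder only
]

# Keys mirror the new extra_body built in _make_async_request; the values here are assumptions.
extra_body = {"input_type": "passage", "truncate": "END"}
# Only models listed in MULTI_MODAL_MODELS get a per-input modality list.
extra_body["modality"] = ["text", "image"]

resp = client.embeddings.create(
    input=inputs,
    model="llama-3.2-nemoretriever-1b-vlm-embed-v1",
    encoding_format="float",
    extra_body=extra_body,
)
print(len(resp.data), "embeddings returned")
```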
nv_ingest_api/internal/transform/split_text.py

@@ -141,14 +141,19 @@ def transform_text_split_and_tokenize_internal(
 
     model_predownload_path = os.environ.get("MODEL_PREDOWNLOAD_PATH")
 
-    if
-
-
-
-
-
-
-
+    if model_predownload_path is not None:
+        if os.path.exists(os.path.join(model_predownload_path, "llama-3.2-1b/tokenizer/tokenizer.json")) and (
+            tokenizer_identifier is None or tokenizer_identifier == "meta-llama/Llama-3.2-1B"
+        ):
+            tokenizer_identifier = os.path.join(model_predownload_path, "llama-3.2-1b/tokenizer/")
+        elif os.path.exists(
+            os.path.join(model_predownload_path, "e5-large-unsupervised/tokenizer/tokenizer.json")
+        ) and (tokenizer_identifier is None or tokenizer_identifier == "intfloat/e5-large-unsupervised"):
+            tokenizer_identifier = os.path.join(model_predownload_path, "e5-large-unsupervised/tokenizer/")
+
+    # Defaulto to intfloat/e5-large-unsupervised if no tokenizer predownloaded or specified
+    if tokenizer_identifier is None:
+        tokenizer_identifier = "intfloat/e5-large-unsupervised"
 
     tokenizer_model = AutoTokenizer.from_pretrained(tokenizer_identifier, token=hf_access_token)
 
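For reference, the new tokenizer selection reduces to: prefer a matching predownloaded tokenizer under `MODEL_PREDOWNLOAD_PATH`, otherwise keep the requested identifier, otherwise fall back to `intfloat/e5-large-unsupervised`. A condensed sketch of that order; the helper name is hypothetical and not part of the package:

```python
import os
from typing import Optional


def resolve_tokenizer(tokenizer_identifier: Optional[str]) -> str:
    """Hypothetical helper restating the fallback order introduced in this hunk."""
    predownload = os.environ.get("MODEL_PREDOWNLOAD_PATH")
    candidates = [
        ("meta-llama/Llama-3.2-1B", "llama-3.2-1b/tokenizer/"),
        ("intfloat/e5-large-unsupervised", "e5-large-unsupervised/tokenizer/"),
    ]
    if predownload is not None:
        for hub_name, subdir in candidates:
            local_dir = os.path.join(predownload, subdir)
            if os.path.exists(os.path.join(local_dir, "tokenizer.json")) and (
                tokenizer_identifier is None or tokenizer_identifier == hub_name
            ):
                return local_dir
    # Default to intfloat/e5-large-unsupervised when nothing was requested or predownloaded.
    return tokenizer_identifier or "intfloat/e5-large-unsupervised"
```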
nv_ingest_api/util/image_processing/table_and_chart.py

@@ -46,14 +46,14 @@ def process_yolox_graphic_elements(yolox_text_dict):
     return chart_content.strip()
 
 
-def match_bboxes(yolox_box, paddle_ocr_boxes, already_matched=None, delta=2.0):
+def match_bboxes(yolox_box, ocr_boxes, already_matched=None, delta=2.0):
     """
     Associates a yolox-graphic-elements box to PaddleOCR bboxes, by taking overlapping boxes.
     Criterion is iou > max_iou / delta where max_iou is the biggest found overlap.
     Boxes are expeceted in format (x0, y0, x1, y1)
     Args:
         yolox_box (np array [4]): Cached Bbox.
-
+        ocr_boxes (np array [n x 4]): PaddleOCR boxes
         already_matched (list or None, Optional): Already matched ids to ignore.
         delta (float, Optional): IoU delta for considering several boxes. Defaults to 2..
     Returns:
@@ -61,10 +61,10 @@ def match_bboxes(yolox_box, paddle_ocr_boxes, already_matched=None, delta=2.0):
     """
     x0_1, y0_1, x1_1, y1_1 = yolox_box
     x0_2, y0_2, x1_2, y1_2 = (
-
-
-
-
+        ocr_boxes[:, 0],
+        ocr_boxes[:, 1],
+        ocr_boxes[:, 2],
+        ocr_boxes[:, 3],
     )
 
     # Intersection
@@ -92,10 +92,10 @@ def match_bboxes(yolox_box, paddle_ocr_boxes, already_matched=None, delta=2.0):
     return matches
 
 
-def
+def join_yolox_graphic_elements_and_ocr_output(yolox_output, ocr_boxes, ocr_txts):
     """
     Matching boxes
-    We need to associate a text to the
+    We need to associate a text to the ocr detections.
     For each class and for each CACHED detections, we look for overlapping text bboxes
     with IoU > max_iou / delta where max_iou is the biggest found overlap.
     Found texts are added to the class representation, and removed from the texts to match
@@ -113,18 +113,18 @@ def join_yolox_graphic_elements_and_paddle_output(yolox_output, paddle_boxes, pa
         "value_label",
     ]
 
-
-
+    ocr_txts = np.array(ocr_txts)
+    ocr_boxes = np.array(ocr_boxes)
 
-    if (
+    if (ocr_txts.size == 0) or (ocr_boxes.size == 0):
         return {}
 
-
+    ocr_boxes = np.array(
         [
-
-
-
-
+            ocr_boxes[:, :, 0].min(-1),
+            ocr_boxes[:, :, 1].min(-1),
+            ocr_boxes[:, :, 0].max(-1),
+            ocr_boxes[:, :, 1].max(-1),
        ]
     ).T
 
@@ -139,10 +139,10 @@ def join_yolox_graphic_elements_and_paddle_output(yolox_output, paddle_boxes, pa
         for yolox_box in yolox_output[k]:
             # if there's a score at the end, drop the score.
             yolox_box = yolox_box[:4]
-
+            ocr_ids = match_bboxes(yolox_box, ocr_boxes, already_matched=already_matched, delta=4)
 
-            if len(
-                text = " ".join(
+            if len(ocr_ids) > 0:
+                text = " ".join(ocr_txts[ocr_ids].tolist())
                 texts.append(text)
 
         processed_texts = []
@@ -161,7 +161,7 @@ def join_yolox_graphic_elements_and_paddle_output(yolox_output, paddle_boxes, pa
     return results
 
 
-def convert_paddle_response_to_psuedo_markdown(bboxes, texts):
+def convert_ocr_response_to_psuedo_markdown(bboxes, texts):
     if (not bboxes) or (not texts):
         return ""
 
@@ -186,22 +186,22 @@ def convert_paddle_response_to_psuedo_markdown(bboxes, texts):
     return results
 
 
-def
-    if (not
+def join_yolox_table_structure_and_ocr_output(yolox_cell_preds, ocr_boxes, ocr_txts):
+    if (not ocr_boxes) or (not ocr_txts):
         return ""
 
-
-
+    ocr_boxes = np.array(ocr_boxes)
+    ocr_boxes_ = np.array(
         [
-
-
-
-
+            ocr_boxes[:, :, 0].min(-1),
+            ocr_boxes[:, :, 1].min(-1),
+            ocr_boxes[:, :, 0].max(-1),
+            ocr_boxes[:, :, 1].max(-1),
        ]
     ).T
 
     assignments = []
-    for i, (b, t) in enumerate(zip(
+    for i, (b, t) in enumerate(zip(ocr_boxes_, ocr_txts)):
         # Find a cell
         matches_cell = assign_boxes(b, yolox_cell_preds["cell"], delta=1)
         cell = yolox_cell_preds["cell"][matches_cell[0]] if len(matches_cell) else b
@@ -221,7 +221,7 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
         assignments.append(
             {
                 "index": i,
-                "
+                "ocr_box": b,
                 "is_table": isinstance(col_ids, np.ndarray) and isinstance(row_ids, np.ndarray),
                 "cell_id": matches_cell[0] if len(matches_cell) else -1,
                 "cell": cell,
@@ -249,13 +249,13 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
         mat = build_markdown(df_table)
         markdown_table = display_markdown(mat, use_header=False)
 
-        all_boxes = np.stack(df_table.
+        all_boxes = np.stack(df_table.ocr_box.values)
         table_box = np.concatenate([all_boxes[:, [0, 1]].min(0), all_boxes[:, [2, 3]].max(0)])
 
         df_table_to_text = pd.DataFrame(
             [
                 {
-                    "
+                    "ocr_box": table_box,
                     "text": markdown_table,
                     "is_table": True,
                 }
@@ -264,7 +264,7 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
     # Final text representations dataframe
     df_text = pd.concat([df_text, df_table_to_text], ignore_index=True)
 
-    df_text = df_text.rename(columns={"
+    df_text = df_text.rename(columns={"ocr_box": "box"})
 
     # Sort by y and x
     df_text["x"] = df_text["box"].apply(lambda x: (x[0] + x[2]) / 2)
@@ -297,12 +297,12 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
     return result
 
 
-def assign_boxes(paddle_box, boxes, delta=2.0, min_overlap=0.25):
+def assign_boxes(ocr_box, boxes, delta=2.0, min_overlap=0.25):
     """
-    Assigns the closest bounding boxes to a reference `
+    Assigns the closest bounding boxes to a reference `ocr_box` based on overlap.
 
     Args:
-
+        ocr_box (list or numpy.ndarray): Reference bounding box [x_min, y_min, x_max, y_max].
         boxes (numpy.ndarray): Array of candidate bounding boxes with shape (N, 4).
         delta (float, optional): Factor for matches relative to the best overlap. Defaults to 2.0.
         min_overlap (float, optional): Minimum required overlap for a match. Defaults to 0.25.
@@ -316,7 +316,7 @@ def assign_boxes(paddle_box, boxes, delta=2.0, min_overlap=0.25):
 
     boxes = np.array(boxes)
 
-    x0_1, y0_1, x1_1, y1_1 =
+    x0_1, y0_1, x1_1, y1_1 = ocr_box
     x0_2, y0_2, x1_2, y1_2 = (
         boxes[:, 0],
         boxes[:, 1],
@@ -331,7 +331,7 @@ def assign_boxes(paddle_box, boxes, delta=2.0, min_overlap=0.25):
     inter_x1 = np.minimum(x1_1, x1_2)
     inter_area = np.maximum(0, inter_y1 - inter_y0) * np.maximum(0, inter_x1 - inter_x0)
 
-    # Normalize by
+    # Normalize by ocr_box size
     area_1 = (y1_1 - y0_1) * (x1_1 - x0_1)
     ious = inter_area / (area_1 + 1e-6)
 
@@ -385,16 +385,16 @@ def merge_text_in_cell(df_cell):
     Returns:
         pandas.DataFrame: Updated DataFrame with merged text and a single bounding box.
     """
-
+    ocr_boxes = np.stack(df_cell["ocr_box"].values)
 
-    df_cell["x"] = (
-    df_cell["y"] = (
+    df_cell["x"] = (ocr_boxes[:, 0] - ocr_boxes[:, 0].min()) // 10
+    df_cell["y"] = (ocr_boxes[:, 1] - ocr_boxes[:, 1].min()) // 10
     df_cell = df_cell.sort_values(["y", "x"])
 
     text = " ".join(df_cell["text"].values.tolist())
     df_cell["text"] = text
     df_cell = df_cell.head(1)
-    df_cell["
+    df_cell["ocr_box"] = df_cell["cell"]
     df_cell.drop(["x", "y"], axis=1, inplace=True)
 
     return df_cell
@@ -447,3 +447,58 @@ def display_markdown(
     markdown_table = "\n".join("| " + " | ".join(row) + " |" for row in data)
 
     return markdown_table
+
+
+def reorder_boxes(boxes, texts, confs, mode="top_left", dbscan_eps=10):
+    """
+    Reorders the boxes in reading order.
+    If mode is "center", the boxes are reordered using bbox center.
+    If mode is "top_left", the boxes are reordered using the top left corner.
+    If dbscan_eps is not 0, the boxes are reordered using DBSCAN clustering.
+
+    Args:
+        boxes (np array [n x 4 x 2]): The bounding boxes of the OCR results.
+        texts (np array [n]): The text of the OCR results.
+        confs (np array [n]): The confidence scores of the OCR results.
+        mode (str, optional): The mode to reorder the boxes. Defaults to "center".
+        dbscan_eps (float, optional): The epsilon parameter for DBSCAN. Defaults to 10.
+
+    Returns:
+        List[List[int, ...]]: The reordered bounding boxes.
+        List[str]: The reordered texts.
+        List[float]: The reordered confidence scores.
+    """
+    df = pd.DataFrame(
+        [[b, t, c] for b, t, c in zip(boxes, texts, confs)],
+        columns=["bbox", "text", "conf"],
+    )
+
+    if mode == "center":
+        df["x"] = df["bbox"].apply(lambda box: (box[0][0] + box[2][0]) / 2)
+        df["y"] = df["bbox"].apply(lambda box: (box[0][1] + box[2][1]) / 2)
+    elif mode == "top_left":
+        df["x"] = df["bbox"].apply(lambda box: (box[0][0]))
+        df["y"] = df["bbox"].apply(lambda box: (box[0][1]))
+
+    if dbscan_eps:
+        do_naive_sorting = False
+        try:
+            dbscan = DBSCAN(eps=dbscan_eps, min_samples=1)
+            dbscan.fit(df["y"].values[:, None])
+            df["cluster"] = dbscan.labels_
+            df["cluster_centers"] = df.groupby("cluster")["y"].transform("mean").astype(int)
+            df = df.sort_values(["cluster_centers", "x"], ascending=[True, True], ignore_index=True)
+        except ValueError:
+            do_naive_sorting = True
+    else:
+        do_naive_sorting = True
+
+    if do_naive_sorting:
+        df["y"] = np.round((df["y"] - df["y"].min()) // 5, 0)
+        df = df.sort_values(["y", "x"], ascending=[True, True], ignore_index=True)
+
+    bboxes = df["bbox"].values.tolist()
+    texts = df["text"].values.tolist()
+    confs = df["conf"].values.tolist()
+
+    return bboxes, texts, confs
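The new `reorder_boxes` helper sorts OCR detections into reading order by clustering y coordinates with DBSCAN and then sorting by x within each row cluster. An illustrative call with made-up 4-point quads, assuming the `[n x 4 x 2]` box format its docstring describes:

```python
import numpy as np

from nv_ingest_api.util.image_processing.table_and_chart import reorder_boxes

# Three toy detections: two words on one line (given out of order) and one word below.
boxes = np.array(
    [
        [[120, 12], [180, 12], [180, 30], [120, 30]],  # "world", right of "hello"
        [[10, 10], [60, 10], [60, 28], [10, 28]],      # "hello"
        [[10, 50], [90, 50], [90, 70], [10, 70]],      # "second line"
    ]
)
texts = ["world", "hello", "second line"]
confs = [0.91, 0.95, 0.88]

ordered_boxes, ordered_texts, ordered_confs = reorder_boxes(boxes, texts, confs)
print(ordered_texts)  # expected reading order: ['hello', 'world', 'second line']
```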
nv_ingest_api/util/image_processing/transforms.py

@@ -20,6 +20,9 @@ cv2.setNumThreads(1)
 DEFAULT_MAX_WIDTH = 1024
 DEFAULT_MAX_HEIGHT = 1280
 
+# Workaround for PIL.Image.DecompressionBombError
+Image.MAX_IMAGE_PIXELS = None
+
 logger = logging.getLogger(__name__)
 
 
@@ -206,6 +209,7 @@ def pad_image(
     target_height: int = DEFAULT_MAX_HEIGHT,
     background_color: int = 255,
     dtype=np.uint8,
+    how: str = "center",
 ) -> Tuple[np.ndarray, Tuple[int, int]]:
     """
     Pads a NumPy array representing an image to the specified target dimensions.
@@ -214,6 +218,8 @@ def pad_image(
     in that dimension. If the target dimensions are larger, the image will be centered within the
     canvas of the specified target size, with the remaining space filled with white padding.
 
+    The padding can be done around the center (how="center"), or to the bottom right (how="bottom_right").
+
     Parameters
     ----------
     array : np.ndarray
@@ -222,6 +228,8 @@ def pad_image(
         The desired target width of the padded image. Defaults to DEFAULT_MAX_WIDTH.
     target_height : int, optional
         The desired target height of the padded image. Defaults to DEFAULT_MAX_HEIGHT.
+    how : str, optional
+        The method to pad the image. Defaults to "center".
 
     Returns
     -------
@@ -246,17 +254,23 @@ def pad_image(
     """
     height, width = array.shape[:2]
 
-    # Determine the padding needed, if any, while ensuring no padding is applied if the target is smaller
-    pad_height = max((target_height - height) // 2, 0)
-    pad_width = max((target_width - width) // 2, 0)
-
     # Determine final canvas size (may be equal to original if target is smaller)
     final_height = max(height, target_height)
     final_width = max(width, target_width)
 
     # Create the canvas and place the original image on it
     canvas = background_color * np.ones((final_height, final_width, array.shape[2]), dtype=dtype)
-
+
+    # Determine the padding needed, if any, while ensuring no padding is applied if the target is smaller
+    if how == "center":
+        pad_height = max((target_height - height) // 2, 0)
+        pad_width = max((target_width - width) // 2, 0)
+
+        canvas[pad_height : pad_height + height, pad_width : pad_width + width] = array  # noqa: E203
+    elif how == "bottom_right":
+        pad_height, pad_width = 0, 0
+
+        canvas[:height, :width] = array  # noqa: E203
 
     return canvas, (pad_width, pad_height)
 
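The new `how` argument selects where the padding goes: `"center"` keeps the previous centering behavior, while `"bottom_right"` anchors the image at the top-left corner and reports a `(0, 0)` offset. A small sketch of both modes on a made-up 2x2 RGB array:

```python
import numpy as np

from nv_ingest_api.util.image_processing.transforms import pad_image

# Tiny 2x2 RGB image, values chosen only for illustration.
img = np.full((2, 2, 3), 7, dtype=np.uint8)

centered, (off_x, off_y) = pad_image(img, target_width=6, target_height=6, how="center")
anchored, offsets = pad_image(img, target_width=6, target_height=6, how="bottom_right")

print(centered.shape, (off_x, off_y))  # (6, 6, 3) canvas with the image offset by (2, 2)
print(offsets)  # (0, 0): image sits at the top-left, padding fills the bottom and right
```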
nv_ingest_api/util/message_brokers/simple_message_broker/broker.py

@@ -250,7 +250,7 @@ class SimpleMessageBrokerHandler(socketserver.BaseRequestHandler):
        with queue_lock:
            if queue.empty():
                # Return failure response immediately
-                response = ResponseSchema(response_code=
+                response = ResponseSchema(response_code=2, response_reason="Job not ready")
                self._send_response(response)
                return
            # Pop the message from the queue
|