nv-ingest-api 25.7.7.dev20250707__py3-none-any.whl → 25.8.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nv-ingest-api has been flagged as possibly problematic.
- nv_ingest_api/interface/extract.py +18 -18
- nv_ingest_api/internal/enums/common.py +6 -0
- nv_ingest_api/internal/extract/image/chart_extractor.py +80 -75
- nv_ingest_api/internal/extract/image/image_helpers/common.py +5 -6
- nv_ingest_api/internal/extract/image/infographic_extractor.py +59 -35
- nv_ingest_api/internal/extract/image/table_extractor.py +84 -64
- nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +9 -8
- nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +32 -20
- nv_ingest_api/internal/extract/pdf/engines/pdfium.py +40 -29
- nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +59 -0
- nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +1 -0
- nv_ingest_api/internal/primitives/nim/model_interface/{paddle.py → ocr.py} +132 -39
- nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +44 -236
- nv_ingest_api/internal/primitives/nim/nim_client.py +61 -18
- nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +6 -6
- nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +6 -6
- nv_ingest_api/internal/schemas/extract/extract_table_schema.py +5 -5
- nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +5 -0
- nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +1 -1
- nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +4 -0
- nv_ingest_api/internal/transform/embed_text.py +105 -12
- nv_ingest_api/internal/transform/split_text.py +13 -8
- nv_ingest_api/util/image_processing/table_and_chart.py +97 -42
- nv_ingest_api/util/image_processing/transforms.py +351 -87
- nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +1 -1
- nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +51 -48
- nv_ingest_api/util/metadata/aggregators.py +4 -1
- nv_ingest_api/util/pdf/pdfium.py +6 -14
- {nv_ingest_api-25.7.7.dev20250707.dist-info → nv_ingest_api-25.8.0rc2.dist-info}/METADATA +2 -1
- {nv_ingest_api-25.7.7.dev20250707.dist-info → nv_ingest_api-25.8.0rc2.dist-info}/RECORD +33 -33
- {nv_ingest_api-25.7.7.dev20250707.dist-info → nv_ingest_api-25.8.0rc2.dist-info}/WHEEL +0 -0
- {nv_ingest_api-25.7.7.dev20250707.dist-info → nv_ingest_api-25.8.0rc2.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest_api-25.7.7.dev20250707.dist-info → nv_ingest_api-25.8.0rc2.dist-info}/top_level.txt +0 -0

nv_ingest_api/internal/schemas/extract/extract_table_schema.py

@@ -22,8 +22,8 @@ class TableExtractorConfigSchema(BaseModel):
     auth_token : Optional[str], default=None
         Authentication token required for secure services.
 
-
-        A tuple containing the gRPC and HTTP services for the
+    ocr_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
+        A tuple containing the gRPC and HTTP services for the ocr endpoint.
         Either the gRPC or HTTP service can be empty, but not both.
 
     Methods
@@ -47,8 +47,8 @@ class TableExtractorConfigSchema(BaseModel):
     yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
     yolox_infer_protocol: str = ""
 
-
-
+    ocr_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
+    ocr_infer_protocol: str = ""
 
     nim_batch_size: int = 2
     workers_per_progress_engine: int = 5
@@ -81,7 +81,7 @@ class TableExtractorConfigSchema(BaseModel):
                 return None
             return service
 
-        for endpoint_name in ["yolox_endpoints", "
+        for endpoint_name in ["yolox_endpoints", "ocr_endpoints"]:
            grpc_service, http_service = values.get(endpoint_name, (None, None))
            grpc_service = clean_service(grpc_service)
            http_service = clean_service(http_service)
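
The three hunks above rename the table extractor's OCR endpoint configuration. A minimal sketch of constructing the updated config follows; the import path and endpoint values are assumptions for illustration, while the field names come from the hunks above.

# Hypothetical usage of the renamed fields; endpoint URLs are placeholders.
from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorConfigSchema

config = TableExtractorConfigSchema(
    auth_token=None,
    yolox_endpoints=("yolox:8001", "http://yolox:8000/v1/infer"),
    yolox_infer_protocol="grpc",
    # Either the gRPC or HTTP entry may be None, but not both (checked by the validator above).
    ocr_endpoints=(None, "http://ocr:8000/v1/infer"),
    ocr_infer_protocol="http",
)
print(config.ocr_endpoints)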

nv_ingest_api/internal/schemas/meta/ingest_job_schema.py

@@ -107,6 +107,10 @@ class IngestTaskEmbedSchema(BaseModelNoExt):
     model_name: Optional[str] = None
     api_key: Optional[str] = None
     filter_errors: bool = False
+    text_elements_modality: Optional[str] = None
+    image_elements_modality: Optional[str] = None
+    structured_elements_modality: Optional[str] = None
+    audio_elements_modality: Optional[str] = None
 
 
 class IngestTaskVdbUploadSchema(BaseModelNoExt):
@@ -195,6 +199,7 @@ class IngestTaskSchema(BaseModelNoExt):
         validated_task_properties = expected_schema_cls(**task_properties)
         values["type"] = task_type  # ensure type is now always the enum
         values["task_properties"] = validated_task_properties
+
         return values
 
     @field_validator("type", mode="before")

nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py

@@ -8,7 +8,7 @@ from pydantic import ConfigDict, BaseModel
 
 class ImageCaptionExtractionSchema(BaseModel):
     api_key: str = "api_key"
-    endpoint_url: str = "https://
+    endpoint_url: str = "https://integrate.api.nvidia.com/v1/chat/completions"
     prompt: str = "Caption the content of this image:"
     model_name: str = "nvidia/llama-3.1-nemotron-nano-vl-8b-v1"
     raise_on_failure: bool = False

nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py

@@ -22,5 +22,9 @@ class TextEmbeddingSchema(BaseModel):
     input_type: str = Field(default="passage")
     raise_on_failure: bool = Field(default=False)
     truncate: str = Field(default="END")
+    text_elements_modality: str = Field(default="text")
+    image_elements_modality: str = Field(default="text")
+    structured_elements_modality: str = Field(default="text")
+    audio_elements_modality: str = Field(default="text")
 
     model_config = ConfigDict(extra="forbid")

nv_ingest_api/internal/transform/embed_text.py

@@ -4,6 +4,7 @@
 
 import logging
 from concurrent.futures import ThreadPoolExecutor
+from functools import partial
 from typing import Any, Dict, Tuple, Optional, Iterable, List
 
 import pandas as pd
@@ -19,6 +20,9 @@ from nv_ingest_api.util.schema.schema_validator import validate_schema
 logger = logging.getLogger(__name__)
 
 
+MULTI_MODAL_MODELS = ["llama-3.2-nemoretriever-1b-vlm-embed-v1"]
+
+
 # ------------------------------------------------------------------------------
 # Asynchronous Embedding Requests
 # ------------------------------------------------------------------------------
@@ -33,6 +37,7 @@ def _make_async_request(
     input_type: str,
     truncate: str,
     filter_errors: bool,
+    modalities: Optional[List[str]] = None,
 ) -> list:
     """
     Interacts directly with the NIM embedding service to calculate embeddings for a batch of prompts.
@@ -74,11 +79,18 @@ def _make_async_request(
             base_url=embedding_nim_endpoint,
         )
 
+        extra_body = {
+            "input_type": input_type,
+            "truncate": truncate,
+        }
+        if modalities:
+            extra_body["modality"] = modalities
+
         resp = client.embeddings.create(
             input=prompts,
             model=embedding_model,
             encoding_format=encoding_format,
-            extra_body=
+            extra_body=extra_body,
         )
 
         response["embedding"] = resp.data
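
For reference, here is a standalone sketch of the request pattern in the hunk above: NIM-specific parameters travel in extra_body on an OpenAI-compatible embeddings call, with an optional per-item modality list. The endpoint URL, model name, and prompt values are placeholders, not taken from the package.

# Minimal sketch of the call pattern above (values are placeholders).
from openai import OpenAI

client = OpenAI(api_key="<api-key>", base_url="http://embedding-nim:8000/v1")

prompts = ["passage: first text chunk", "data:image/png;base64,<...>"]
modalities = ["text", "image"]  # one entry per prompt, matching the batching in this module

extra_body = {"input_type": "passage", "truncate": "END"}
if modalities:
    extra_body["modality"] = modalities

resp = client.embeddings.create(
    input=prompts,
    model="llama-3.2-nemoretriever-1b-vlm-embed-v1",
    encoding_format="float",
    extra_body=extra_body,
)
embeddings = [d.embedding for d in resp.data]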
@@ -110,6 +122,7 @@ def _async_request_handler(
     input_type: str,
     truncate: str,
     filter_errors: bool,
+    modalities: Optional[List[str]] = None,
 ) -> List[dict]:
     """
     Gathers calculated embedding results from the NIM embedding service concurrently.
@@ -138,6 +151,9 @@ def _async_request_handler(
     List[dict]
         A list of response dictionaries from the embedding service.
     """
+    if modalities is None:
+        modalities = [None] * len(prompts)
+
     with ThreadPoolExecutor() as executor:
         futures = [
             executor.submit(
@@ -150,8 +166,9 @@ def _async_request_handler(
                 input_type=input_type,
                 truncate=truncate,
                 filter_errors=filter_errors,
+                modalities=modality_batch,
             )
-            for prompt_batch in prompts
+            for prompt_batch, modality_batch in zip(prompts, modalities)
         ]
         results = [future.result() for future in futures]
 
@@ -167,6 +184,7 @@ def _async_runner(
     input_type: str,
     truncate: str,
     filter_errors: bool,
+    modalities: Optional[List[str]] = None,
 ) -> dict:
     """
     Concurrently launches all NIM embedding requests and flattens the results.
@@ -204,6 +222,7 @@ def _async_runner(
         input_type,
         truncate,
         filter_errors,
+        modalities=modalities,
     )
 
     flat_results = {"embeddings": [], "info_msgs": []}
@@ -263,7 +282,19 @@ def _add_embeddings(row, embeddings, info_msgs):
     return row
 
 
-def
+def _format_image_input_string(image_b64: Optional[str]) -> str:
+    if not image_b64:
+        return
+    return f"data:image/png;base64,{image_b64}"
+
+
+def _format_text_image_pair_input_string(text: Optional[str], image_b64: Optional[str]) -> str:
+    if (not text) or (not text.strip()) or (not image_b64):
+        return
+    return f"{text.strip()} {_format_image_input_string(image_b64)}"
+
+
+def _get_pandas_text_content(row, modality="text"):
     """
     Extracts text content from a DataFrame row.
 
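
The two formatting helpers added above build the multimodal prompt strings. A self-contained restatement (for illustration only, not imported from the package) shows the expected shapes:

# Self-contained restatement of the helpers above, for illustration only.
import base64
from typing import Optional

def format_image_input_string(image_b64: Optional[str]) -> Optional[str]:
    # Wrap a base64-encoded PNG as a data URI, the format sent for image inputs.
    if not image_b64:
        return None
    return f"data:image/png;base64,{image_b64}"

def format_text_image_pair_input_string(text: Optional[str], image_b64: Optional[str]) -> Optional[str]:
    # Join stripped text and the image data URI into a single "text_image" prompt.
    if (not text) or (not text.strip()) or (not image_b64):
        return None
    return f"{text.strip()} {format_image_input_string(image_b64)}"

image_b64 = base64.b64encode(b"<png bytes>").decode()
print(format_image_input_string(image_b64))
print(format_text_image_pair_input_string("Quarterly revenue table", image_b64))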
@@ -280,7 +311,7 @@ def _get_pandas_text_content(row):
     return row["content"]
 
 
-def _get_pandas_table_content(row):
+def _get_pandas_table_content(row, modality="text"):
     """
     Extracts table/chart content from a DataFrame row.
 
@@ -294,10 +325,19 @@ def _get_pandas_table_content(row):
     str
         The table/chart content from the row.
     """
-
+    if modality == "text":
+        content = row.get("table_metadata", {}).get("table_content")
+    elif modality == "image":
+        content = _format_image_input_string(row.get("content"))
+    elif modality == "text_image":
+        text = row.get("table_metadata", {}).get("table_content")
+        image = row.get("content")
+        content = _format_text_image_pair_input_string(text, image)
+
+    return content
 
 
-def _get_pandas_image_content(row):
+def _get_pandas_image_content(row, modality="text"):
     """
     Extracts image caption content from a DataFrame row.
 
@@ -311,10 +351,30 @@ def _get_pandas_image_content(row):
     str
         The image caption from the row.
     """
-
+    subtype = row.get("content_metadata", {}).get("subtype")
+    if modality == "text":
+        if subtype == "page_image":
+            content = row.get("image_metadata", {}).get("text")
+        else:
+            content = row.get("image_metadata", {}).get("caption")
+    elif modality == "image":
+        content = _format_image_input_string(row.get("content"))
+    elif modality == "text_image":
+        if subtype == "page_image":
+            text = row.get("image_metadata", {}).get("text")
+        else:
+            text = row.get("image_metadata", {}).get("caption")
+        image = row.get("content")
+        content = _format_text_image_pair_input_string(text, image)
 
+    if subtype == "page_image":
+        # A workaround to save memory for full page images.
+        row["content"] = ""
 
-
+    return content
+
+
+def _get_pandas_audio_content(row, modality="text"):
     """
     A pandas UDF used to select extracted audio transcription to be used to create embeddings.
     """
@@ -408,6 +468,23 @@ def _concatenate_extractions_pandas(
 # ------------------------------------------------------------------------------
 
 
+def does_model_support_multimodal_embeddings(model: str) -> bool:
+    """
+    Checks if a given model supports multi-modal embeddings.
+
+    Parameters
+    ----------
+    model : str
+        The name of the model.
+
+    Returns
+    -------
+    bool
+        True if the model supports multi-modal embeddings, False otherwise.
+    """
+    return model in MULTI_MODAL_MODELS
+
+
 def transform_create_text_embeddings_internal(
     df_transform_ledger: pd.DataFrame,
     task_config: Dict[str, Any],
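
A small sketch of how the new helper can gate modality selection; the import path follows this diff, and the surrounding configuration is illustrative:

# Illustrative only: choose richer modalities when the embedder is multi-modal.
from nv_ingest_api.internal.transform.embed_text import does_model_support_multimodal_embeddings

model_name = "llama-3.2-nemoretriever-1b-vlm-embed-v1"
if does_model_support_multimodal_embeddings(model_name):
    image_elements_modality = "image"  # or "text_image"
else:
    image_elements_modality = "text"   # captions only for text-only embedders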
@@ -460,6 +537,15 @@ def transform_create_text_embeddings_internal(
         ContentTypeEnum.AUDIO: _get_pandas_audio_content,
         ContentTypeEnum.VIDEO: lambda x: None,  # Not supported yet.
     }
+    task_type_to_modality = {
+        ContentTypeEnum.TEXT: task_config.get("text_elements_modality") or transform_config.text_elements_modality,
+        ContentTypeEnum.STRUCTURED: (
+            task_config.get("structured_elements_modality") or transform_config.structured_elements_modality
+        ),
+        ContentTypeEnum.IMAGE: task_config.get("image_elements_modality") or transform_config.image_elements_modality,
+        ContentTypeEnum.AUDIO: task_config.get("audio_elements_modality") or transform_config.audio_elements_modality,
+        ContentTypeEnum.VIDEO: lambda x: None,  # Not supported yet.
+    }
 
     def _content_type_getter(row):
         return row["content_metadata"]["type"]
@@ -480,7 +566,7 @@ transform_create_text_embeddings_internal(
         # Extract content and normalize empty or non-str to None
         extracted_content = (
             df_content["metadata"]
-            .apply(content_getter)
+            .apply(partial(content_getter, modality=task_type_to_modality[content_type]))
             .apply(lambda x: x.strip() if isinstance(x, str) and x.strip() else None)
         )
         df_content["_content"] = extracted_content
@@ -488,9 +574,15 @@
         # Prepare batches for only valid (non-None) content
         valid_content_mask = df_content["_content"].notna()
         if valid_content_mask.any():
-
-
-
+            filtered_content_list = df_content.loc[valid_content_mask, "_content"].tolist()
+            filtered_content_batches = _generate_batches(filtered_content_list, batch_size=transform_config.batch_size)
+
+            if model_name in MULTI_MODAL_MODELS:
+                modality_list = [task_type_to_modality[content_type]] * len(filtered_content_list)
+                modality_batches = _generate_batches(modality_list, batch_size=transform_config.batch_size)
+            else:
+                modality_batches = None
+
             content_embeddings = _async_runner(
                 filtered_content_batches,
                 api_key,
@@ -500,6 +592,7 @@
                 transform_config.input_type,
                 transform_config.truncate,
                 False,
+                modalities=modality_batches,
             )
             # Build a simple row index -> embedding map
             embeddings_dict = dict(

nv_ingest_api/internal/transform/split_text.py

@@ -141,14 +141,19 @@ def transform_text_split_and_tokenize_internal(
 
     model_predownload_path = os.environ.get("MODEL_PREDOWNLOAD_PATH")
 
-    if
-
-
-
-
-
-
-
+    if model_predownload_path is not None:
+        if os.path.exists(os.path.join(model_predownload_path, "llama-3.2-1b/tokenizer/tokenizer.json")) and (
+            tokenizer_identifier is None or tokenizer_identifier == "meta-llama/Llama-3.2-1B"
+        ):
+            tokenizer_identifier = os.path.join(model_predownload_path, "llama-3.2-1b/tokenizer/")
+        elif os.path.exists(
+            os.path.join(model_predownload_path, "e5-large-unsupervised/tokenizer/tokenizer.json")
+        ) and (tokenizer_identifier is None or tokenizer_identifier == "intfloat/e5-large-unsupervised"):
+            tokenizer_identifier = os.path.join(model_predownload_path, "e5-large-unsupervised/tokenizer/")
+
+    # Defaulto to intfloat/e5-large-unsupervised if no tokenizer predownloaded or specified
+    if tokenizer_identifier is None:
+        tokenizer_identifier = "intfloat/e5-large-unsupervised"
 
     tokenizer_model = AutoTokenizer.from_pretrained(tokenizer_identifier, token=hf_access_token)
 
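
The tokenizer-resolution order added above can be summarized by the sketch below; the helper name and layout are illustrative, while the paths and identifiers come from the hunk:

# Illustrative helper: prefer a predownloaded tokenizer under MODEL_PREDOWNLOAD_PATH,
# otherwise fall back to intfloat/e5-large-unsupervised.
import os

def resolve_tokenizer(tokenizer_identifier=None):
    predownload = os.environ.get("MODEL_PREDOWNLOAD_PATH")
    candidates = {
        "meta-llama/Llama-3.2-1B": "llama-3.2-1b/tokenizer/",
        "intfloat/e5-large-unsupervised": "e5-large-unsupervised/tokenizer/",
    }
    if predownload is not None:
        for hub_name, local_dir in candidates.items():
            local_path = os.path.join(predownload, local_dir)
            if os.path.exists(os.path.join(local_path, "tokenizer.json")) and (
                tokenizer_identifier is None or tokenizer_identifier == hub_name
            ):
                return local_path
    return tokenizer_identifier or "intfloat/e5-large-unsupervised"

print(resolve_tokenizer())  # "intfloat/e5-large-unsupervised" when nothing is predownloaded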

nv_ingest_api/util/image_processing/table_and_chart.py

@@ -46,14 +46,14 @@ def process_yolox_graphic_elements(yolox_text_dict):
     return chart_content.strip()
 
 
-def match_bboxes(yolox_box,
+def match_bboxes(yolox_box, ocr_boxes, already_matched=None, delta=2.0):
     """
     Associates a yolox-graphic-elements box to PaddleOCR bboxes, by taking overlapping boxes.
     Criterion is iou > max_iou / delta where max_iou is the biggest found overlap.
     Boxes are expeceted in format (x0, y0, x1, y1)
     Args:
         yolox_box (np array [4]): Cached Bbox.
-
+        ocr_boxes (np array [n x 4]): PaddleOCR boxes
         already_matched (list or None, Optional): Already matched ids to ignore.
         delta (float, Optional): IoU delta for considering several boxes. Defaults to 2..
     Returns:
@@ -61,10 +61,10 @@ def match_bboxes(yolox_box, paddle_ocr_boxes, already_matched=None, delta=2.0):
     """
     x0_1, y0_1, x1_1, y1_1 = yolox_box
     x0_2, y0_2, x1_2, y1_2 = (
-
-
-
-
+        ocr_boxes[:, 0],
+        ocr_boxes[:, 1],
+        ocr_boxes[:, 2],
+        ocr_boxes[:, 3],
     )
 
     # Intersection
@@ -92,10 +92,10 @@ def match_bboxes(yolox_box, paddle_ocr_boxes, already_matched=None, delta=2.0):
     return matches
 
 
-def
+def join_yolox_graphic_elements_and_ocr_output(yolox_output, ocr_boxes, ocr_txts):
     """
     Matching boxes
-    We need to associate a text to the
+    We need to associate a text to the ocr detections.
     For each class and for each CACHED detections, we look for overlapping text bboxes
     with IoU > max_iou / delta where max_iou is the biggest found overlap.
     Found texts are added to the class representation, and removed from the texts to match
@@ -113,18 +113,18 @@ def join_yolox_graphic_elements_and_paddle_output(yolox_output, paddle_boxes, pa
         "value_label",
     ]
 
-
-
+    ocr_txts = np.array(ocr_txts)
+    ocr_boxes = np.array(ocr_boxes)
 
-    if (
+    if (ocr_txts.size == 0) or (ocr_boxes.size == 0):
         return {}
 
-
+    ocr_boxes = np.array(
         [
-
-
-
-
+            ocr_boxes[:, :, 0].min(-1),
+            ocr_boxes[:, :, 1].min(-1),
+            ocr_boxes[:, :, 0].max(-1),
+            ocr_boxes[:, :, 1].max(-1),
         ]
     ).T
 
@@ -139,10 +139,10 @@ def join_yolox_graphic_elements_and_paddle_output(yolox_output, paddle_boxes, pa
         for yolox_box in yolox_output[k]:
             # if there's a score at the end, drop the score.
             yolox_box = yolox_box[:4]
-
+            ocr_ids = match_bboxes(yolox_box, ocr_boxes, already_matched=already_matched, delta=4)
 
-            if len(
-                text = " ".join(
+            if len(ocr_ids) > 0:
+                text = " ".join(ocr_txts[ocr_ids].tolist())
                 texts.append(text)
 
     processed_texts = []
@@ -161,7 +161,7 @@ def join_yolox_graphic_elements_and_paddle_output(yolox_output, paddle_boxes, pa
     return results
 
 
-def
+def convert_ocr_response_to_psuedo_markdown(bboxes, texts):
     if (not bboxes) or (not texts):
         return ""
 
@@ -186,22 +186,22 @@ def convert_paddle_response_to_psuedo_markdown(bboxes, texts):
     return results
 
 
-def
-    if (not
+def join_yolox_table_structure_and_ocr_output(yolox_cell_preds, ocr_boxes, ocr_txts):
+    if (not ocr_boxes) or (not ocr_txts):
         return ""
 
-
-
+    ocr_boxes = np.array(ocr_boxes)
+    ocr_boxes_ = np.array(
         [
-
-
-
-
+            ocr_boxes[:, :, 0].min(-1),
+            ocr_boxes[:, :, 1].min(-1),
+            ocr_boxes[:, :, 0].max(-1),
+            ocr_boxes[:, :, 1].max(-1),
         ]
     ).T
 
     assignments = []
-    for i, (b, t) in enumerate(zip(
+    for i, (b, t) in enumerate(zip(ocr_boxes_, ocr_txts)):
         # Find a cell
         matches_cell = assign_boxes(b, yolox_cell_preds["cell"], delta=1)
         cell = yolox_cell_preds["cell"][matches_cell[0]] if len(matches_cell) else b
@@ -221,7 +221,7 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
         assignments.append(
             {
                 "index": i,
-                "
+                "ocr_box": b,
                 "is_table": isinstance(col_ids, np.ndarray) and isinstance(row_ids, np.ndarray),
                 "cell_id": matches_cell[0] if len(matches_cell) else -1,
                 "cell": cell,
@@ -249,13 +249,13 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
         mat = build_markdown(df_table)
         markdown_table = display_markdown(mat, use_header=False)
 
-        all_boxes = np.stack(df_table.
+        all_boxes = np.stack(df_table.ocr_box.values)
         table_box = np.concatenate([all_boxes[:, [0, 1]].min(0), all_boxes[:, [2, 3]].max(0)])
 
         df_table_to_text = pd.DataFrame(
             [
                 {
-                    "
+                    "ocr_box": table_box,
                     "text": markdown_table,
                     "is_table": True,
                 }
@@ -264,7 +264,7 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
     # Final text representations dataframe
     df_text = pd.concat([df_text, df_table_to_text], ignore_index=True)
 
-    df_text = df_text.rename(columns={"
+    df_text = df_text.rename(columns={"ocr_box": "box"})
 
     # Sort by y and x
     df_text["x"] = df_text["box"].apply(lambda x: (x[0] + x[2]) / 2)
@@ -297,12 +297,12 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
     return result
 
 
-def assign_boxes(
+def assign_boxes(ocr_box, boxes, delta=2.0, min_overlap=0.25):
     """
-    Assigns the closest bounding boxes to a reference `
+    Assigns the closest bounding boxes to a reference `ocr_box` based on overlap.
 
     Args:
-
+        ocr_box (list or numpy.ndarray): Reference bounding box [x_min, y_min, x_max, y_max].
         boxes (numpy.ndarray): Array of candidate bounding boxes with shape (N, 4).
         delta (float, optional): Factor for matches relative to the best overlap. Defaults to 2.0.
         min_overlap (float, optional): Minimum required overlap for a match. Defaults to 0.25.
@@ -316,7 +316,7 @@ def assign_boxes(paddle_box, boxes, delta=2.0, min_overlap=0.25):
 
     boxes = np.array(boxes)
 
-    x0_1, y0_1, x1_1, y1_1 =
+    x0_1, y0_1, x1_1, y1_1 = ocr_box
     x0_2, y0_2, x1_2, y1_2 = (
         boxes[:, 0],
         boxes[:, 1],
@@ -331,7 +331,7 @@ def assign_boxes(paddle_box, boxes, delta=2.0, min_overlap=0.25):
     inter_x1 = np.minimum(x1_1, x1_2)
     inter_area = np.maximum(0, inter_y1 - inter_y0) * np.maximum(0, inter_x1 - inter_x0)
 
-    # Normalize by
+    # Normalize by ocr_box size
     area_1 = (y1_1 - y0_1) * (x1_1 - x0_1)
     ious = inter_area / (area_1 + 1e-6)
 
@@ -385,16 +385,16 @@ def merge_text_in_cell(df_cell):
     Returns:
         pandas.DataFrame: Updated DataFrame with merged text and a single bounding box.
     """
-
+    ocr_boxes = np.stack(df_cell["ocr_box"].values)
 
-    df_cell["x"] = (
-    df_cell["y"] = (
+    df_cell["x"] = (ocr_boxes[:, 0] - ocr_boxes[:, 0].min()) // 10
+    df_cell["y"] = (ocr_boxes[:, 1] - ocr_boxes[:, 1].min()) // 10
     df_cell = df_cell.sort_values(["y", "x"])
 
     text = " ".join(df_cell["text"].values.tolist())
     df_cell["text"] = text
     df_cell = df_cell.head(1)
-    df_cell["
+    df_cell["ocr_box"] = df_cell["cell"]
     df_cell.drop(["x", "y"], axis=1, inplace=True)
 
     return df_cell
@@ -447,3 +447,58 @@ def display_markdown(
     markdown_table = "\n".join("| " + " | ".join(row) + " |" for row in data)
 
     return markdown_table
+
+
+def reorder_boxes(boxes, texts, confs, mode="top_left", dbscan_eps=10):
+    """
+    Reorders the boxes in reading order.
+    If mode is "center", the boxes are reordered using bbox center.
+    If mode is "top_left", the boxes are reordered using the top left corner.
+    If dbscan_eps is not 0, the boxes are reordered using DBSCAN clustering.
+
+    Args:
+        boxes (np array [n x 4 x 2]): The bounding boxes of the OCR results.
+        texts (np array [n]): The text of the OCR results.
+        confs (np array [n]): The confidence scores of the OCR results.
+        mode (str, optional): The mode to reorder the boxes. Defaults to "center".
+        dbscan_eps (float, optional): The epsilon parameter for DBSCAN. Defaults to 10.
+
+    Returns:
+        List[List[int, ...]]: The reordered bounding boxes.
+        List[str]: The reordered texts.
+        List[float]: The reordered confidence scores.
+    """
+    df = pd.DataFrame(
+        [[b, t, c] for b, t, c in zip(boxes, texts, confs)],
+        columns=["bbox", "text", "conf"],
+    )
+
+    if mode == "center":
+        df["x"] = df["bbox"].apply(lambda box: (box[0][0] + box[2][0]) / 2)
+        df["y"] = df["bbox"].apply(lambda box: (box[0][1] + box[2][1]) / 2)
+    elif mode == "top_left":
+        df["x"] = df["bbox"].apply(lambda box: (box[0][0]))
+        df["y"] = df["bbox"].apply(lambda box: (box[0][1]))
+
+    if dbscan_eps:
+        do_naive_sorting = False
+        try:
+            dbscan = DBSCAN(eps=dbscan_eps, min_samples=1)
+            dbscan.fit(df["y"].values[:, None])
+            df["cluster"] = dbscan.labels_
+            df["cluster_centers"] = df.groupby("cluster")["y"].transform("mean").astype(int)
+            df = df.sort_values(["cluster_centers", "x"], ascending=[True, True], ignore_index=True)
+        except ValueError:
+            do_naive_sorting = True
+    else:
+        do_naive_sorting = True
+
+    if do_naive_sorting:
+        df["y"] = np.round((df["y"] - df["y"].min()) // 5, 0)
+        df = df.sort_values(["y", "x"], ascending=[True, True], ignore_index=True)
+
+    bboxes = df["bbox"].values.tolist()
+    texts = df["text"].values.tolist()
+    confs = df["conf"].values.tolist()
+
+    return bboxes, texts, confs
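
A usage sketch for the new reorder_boxes helper follows; the import path matches this diff, the sample boxes are illustrative, and the DBSCAN branch requires scikit-learn. Quadrilateral boxes are given as [top-left, top-right, bottom-right, bottom-left] corner points.

from nv_ingest_api.util.image_processing.table_and_chart import reorder_boxes

boxes = [
    [[200, 12], [260, 12], [260, 30], [200, 30]],  # right-hand word, first line
    [[10, 10], [80, 10], [80, 30], [10, 30]],      # left-hand word, first line
    [[10, 60], [90, 60], [90, 80], [10, 80]],      # word on the second line
]
texts = ["world", "hello", "again"]
confs = [0.97, 0.99, 0.95]

ordered_boxes, ordered_texts, ordered_confs = reorder_boxes(boxes, texts, confs)
print(ordered_texts)  # expected reading order: ['hello', 'world', 'again']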