nv-ingest-api 2025.7.15.dev20250715__py3-none-any.whl → 2025.7.17.dev20250717__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)
  1. nv_ingest_api/interface/extract.py +18 -18
  2. nv_ingest_api/internal/enums/common.py +6 -0
  3. nv_ingest_api/internal/extract/image/chart_extractor.py +75 -55
  4. nv_ingest_api/internal/extract/image/infographic_extractor.py +59 -35
  5. nv_ingest_api/internal/extract/image/table_extractor.py +81 -63
  6. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +7 -7
  7. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +32 -20
  8. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +32 -9
  9. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +58 -0
  10. nv_ingest_api/internal/primitives/nim/model_interface/{paddle.py → ocr.py} +132 -39
  11. nv_ingest_api/internal/primitives/nim/nim_client.py +46 -11
  12. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +6 -6
  13. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +6 -6
  14. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +5 -5
  15. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +5 -0
  16. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +4 -0
  17. nv_ingest_api/internal/transform/embed_text.py +103 -12
  18. nv_ingest_api/internal/transform/split_text.py +13 -8
  19. nv_ingest_api/util/image_processing/table_and_chart.py +97 -42
  20. nv_ingest_api/util/image_processing/transforms.py +19 -5
  21. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +1 -1
  22. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +51 -48
  23. nv_ingest_api/util/metadata/aggregators.py +4 -1
  24. {nv_ingest_api-2025.7.15.dev20250715.dist-info → nv_ingest_api-2025.7.17.dev20250717.dist-info}/METADATA +1 -1
  25. {nv_ingest_api-2025.7.15.dev20250715.dist-info → nv_ingest_api-2025.7.17.dev20250717.dist-info}/RECORD +28 -28
  26. {nv_ingest_api-2025.7.15.dev20250715.dist-info → nv_ingest_api-2025.7.17.dev20250717.dist-info}/WHEEL +0 -0
  27. {nv_ingest_api-2025.7.15.dev20250715.dist-info → nv_ingest_api-2025.7.17.dev20250717.dist-info}/licenses/LICENSE +0 -0
  28. {nv_ingest_api-2025.7.15.dev20250715.dist-info → nv_ingest_api-2025.7.17.dev20250717.dist-info}/top_level.txt +0 -0
--- a/nv_ingest_api/internal/transform/embed_text.py
+++ b/nv_ingest_api/internal/transform/embed_text.py
@@ -4,6 +4,7 @@
 
 import logging
 from concurrent.futures import ThreadPoolExecutor
+from functools import partial
 from typing import Any, Dict, Tuple, Optional, Iterable, List
 
 import pandas as pd
@@ -19,6 +20,9 @@ from nv_ingest_api.util.schema.schema_validator import validate_schema
 logger = logging.getLogger(__name__)
 
 
+MULTI_MODAL_MODELS = ["llama-3.2-nemoretriever-1b-vlm-embed-v1"]
+
+
 # ------------------------------------------------------------------------------
 # Asynchronous Embedding Requests
 # ------------------------------------------------------------------------------
@@ -33,6 +37,7 @@ def _make_async_request(
     input_type: str,
     truncate: str,
     filter_errors: bool,
+    modalities: Optional[List[str]] = None,
 ) -> list:
     """
     Interacts directly with the NIM embedding service to calculate embeddings for a batch of prompts.
@@ -74,11 +79,18 @@ def _make_async_request(
         base_url=embedding_nim_endpoint,
     )
 
+    extra_body = {
+        "input_type": input_type,
+        "truncate": truncate,
+    }
+    if modalities:
+        extra_body["modality"] = modalities
+
     resp = client.embeddings.create(
         input=prompts,
         model=embedding_model,
         encoding_format=encoding_format,
-        extra_body={"input_type": input_type, "truncate": truncate},
+        extra_body=extra_body,
     )
 
     response["embedding"] = resp.data
@@ -110,6 +122,7 @@ def _async_request_handler(
     input_type: str,
     truncate: str,
     filter_errors: bool,
+    modalities: Optional[List[str]] = None,
 ) -> List[dict]:
     """
     Gathers calculated embedding results from the NIM embedding service concurrently.
@@ -138,6 +151,9 @@
     List[dict]
         A list of response dictionaries from the embedding service.
     """
+    if modalities is None:
+        modalities = [None] * len(prompts)
+
     with ThreadPoolExecutor() as executor:
         futures = [
             executor.submit(
@@ -150,8 +166,9 @@
                 input_type=input_type,
                 truncate=truncate,
                 filter_errors=filter_errors,
+                modalities=modality_batch,
             )
-            for prompt_batch in prompts
+            for prompt_batch, modality_batch in zip(prompts, modalities)
         ]
         results = [future.result() for future in futures]
 
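The `[None] * len(prompts)` default above keeps the comprehension uniform: `zip` pairs each prompt batch with its modality batch, or with `None` when the caller supplied no modalities. A quick sketch:

prompts = [["a", "b"], ["c"]]  # two prompt batches
modalities = None              # caller did not pass modalities

if modalities is None:
    modalities = [None] * len(prompts)  # one placeholder per batch

print(list(zip(prompts, modalities)))
# [(['a', 'b'], None), (['c'], None)]
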
@@ -167,6 +184,7 @@
     input_type: str,
     truncate: str,
     filter_errors: bool,
+    modalities: Optional[List[str]] = None,
 ) -> dict:
     """
     Concurrently launches all NIM embedding requests and flattens the results.
@@ -204,6 +222,7 @@
         input_type,
         truncate,
         filter_errors,
+        modalities=modalities,
     )
 
     flat_results = {"embeddings": [], "info_msgs": []}
@@ -263,7 +282,19 @@ def _add_embeddings(row, embeddings, info_msgs):
     return row
 
 
-def _get_pandas_text_content(row):
+def _format_image_input_string(image_b64: Optional[str]) -> str:
+    if not image_b64:
+        return
+    return f"data:image/png;base64,{image_b64}"
+
+
+def _format_text_image_pair_input_string(text: Optional[str], image_b64: Optional[str]) -> str:
+    if (not text) or (not text.strip()) or (not image_b64):
+        return
+    return f"{text.strip()} {_format_image_input_string(image_b64)}"
+
+
+def _get_pandas_text_content(row, modality="text"):
     """
     Extracts text content from a DataFrame row.
 
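The two new helpers build the string forms the multimodal embedder consumes: a data URI for a bare image, and text plus data URI for a text-image pair. Note that both fall through to an implicit None (despite the -> str annotation) when input is missing, which downstream code filters out. Illustrative values:

_format_image_input_string("iVBORw0KGgo=")
# -> 'data:image/png;base64,iVBORw0KGgo='

_format_text_image_pair_input_string("A bar chart of sales ", "iVBORw0KGgo=")
# -> 'A bar chart of sales data:image/png;base64,iVBORw0KGgo='

_format_text_image_pair_input_string("", "iVBORw0KGgo=")
# -> None (empty text rejects the pair; the bare return yields None)
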
@@ -280,7 +311,7 @@ def _get_pandas_text_content(row):
     return row["content"]
 
 
-def _get_pandas_table_content(row):
+def _get_pandas_table_content(row, modality="text"):
     """
     Extracts table/chart content from a DataFrame row.
 
@@ -294,10 +325,19 @@ def _get_pandas_table_content(row):
     str
         The table/chart content from the row.
     """
-    return row.get("table_metadata", {}).get("table_content")
+    if modality == "text":
+        content = row.get("table_metadata", {}).get("table_content")
+    elif modality == "image":
+        content = _format_image_input_string(row.get("content"))
+    elif modality == "text_image":
+        text = row.get("table_metadata", {}).get("table_content")
+        image = row.get("content")
+        content = _format_text_image_pair_input_string(text, image)
+
+    return content
 
 
-def _get_pandas_image_content(row):
+def _get_pandas_image_content(row, modality="text"):
     """
     Extracts image caption content from a DataFrame row.
 
@@ -311,10 +351,28 @@ def _get_pandas_image_content(row):
     str
         The image caption from the row.
     """
-    return row.get("image_metadata", {}).get("caption")
+    subtype = row.get("content_metadata", {}).get("subtype")
+    if modality == "text":
+        if subtype == "page_image":
+            content = row.get("image_metadata", {}).get("text")
+        else:
+            content = row.get("image_metadata", {}).get("caption")
+    elif modality == "image":
+        content = _format_image_input_string(row.get("content"))
+    elif modality == "text_image":
+        if subtype == "page_image":
+            text = row.get("image_metadata", {}).get("text")
+        else:
+            text = row.get("image_metadata", {}).get("caption")
+        image = row.get("content")
+        content = _format_text_image_pair_input_string(text, image)
 
+    # A workaround to save memory.
+    row["content"] = ""
+    return content
 
-def _get_pandas_audio_content(row):
+
+def _get_pandas_audio_content(row, modality="text"):
     """
     A pandas UDF used to select extracted audio transcription to be used to create embeddings.
     """
@@ -408,6 +466,23 @@ def _concatenate_extractions_pandas(
 # ------------------------------------------------------------------------------
 
 
+def does_model_support_multimodal_embeddings(model: str) -> bool:
+    """
+    Checks if a given model supports multi-modal embeddings.
+
+    Parameters
+    ----------
+    model : str
+        The name of the model.
+
+    Returns
+    -------
+    bool
+        True if the model supports multi-modal embeddings, False otherwise.
+    """
+    return model in MULTI_MODAL_MODELS
+
+
 def transform_create_text_embeddings_internal(
     df_transform_ledger: pd.DataFrame,
     task_config: Dict[str, Any],
@@ -460,6 +535,15 @@
         ContentTypeEnum.AUDIO: _get_pandas_audio_content,
         ContentTypeEnum.VIDEO: lambda x: None,  # Not supported yet.
     }
+    task_type_to_modality = {
+        ContentTypeEnum.TEXT: task_config.get("text_elements_modality") or transform_config.text_elements_modality,
+        ContentTypeEnum.STRUCTURED: (
+            task_config.get("structured_elements_modality") or transform_config.structured_elements_modality
+        ),
+        ContentTypeEnum.IMAGE: task_config.get("image_elements_modality") or transform_config.image_elements_modality,
+        ContentTypeEnum.AUDIO: task_config.get("audio_elements_modality") or transform_config.audio_elements_modality,
+        ContentTypeEnum.VIDEO: lambda x: None,  # Not supported yet.
+    }
 
     def _content_type_getter(row):
         return row["content_metadata"]["type"]
@@ -480,7 +564,7 @@
         # Extract content and normalize empty or non-str to None
         extracted_content = (
             df_content["metadata"]
-            .apply(content_getter)
+            .apply(partial(content_getter, modality=task_type_to_modality[content_type]))
             .apply(lambda x: x.strip() if isinstance(x, str) and x.strip() else None)
         )
         df_content["_content"] = extracted_content
@@ -488,9 +572,15 @@
         # Prepare batches for only valid (non-None) content
         valid_content_mask = df_content["_content"].notna()
         if valid_content_mask.any():
-            filtered_content_batches = _generate_batches(
-                df_content.loc[valid_content_mask, "_content"].tolist(), batch_size=transform_config.batch_size
-            )
+            filtered_content_list = df_content.loc[valid_content_mask, "_content"].tolist()
+            filtered_content_batches = _generate_batches(filtered_content_list, batch_size=transform_config.batch_size)
+
+            if model_name in MULTI_MODAL_MODELS:
+                modality_list = [task_type_to_modality[content_type]] * len(filtered_content_list)
+                modality_batches = _generate_batches(modality_list, batch_size=transform_config.batch_size)
+            else:
+                modality_batches = None
+
             content_embeddings = _async_runner(
                 filtered_content_batches,
                 api_key,
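
Because the content and modality batches are cut from equal-length lists with the same batch_size, they stay index-aligned all the way through _async_runner. A sketch with a simplified stand-in for _generate_batches (the real helper is not shown in this diff):

def _generate_batches(items, batch_size):  # simplified stand-in
    return [items[i : i + batch_size] for i in range(0, len(items), batch_size)]

content = ["t1", "t2", "t3"]
modality = ["text_image"] * len(content)

print(_generate_batches(content, 2))   # [['t1', 't2'], ['t3']]
print(_generate_batches(modality, 2))  # [['text_image', 'text_image'], ['text_image']]
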
@@ -500,6 +590,7 @@
                 transform_config.input_type,
                 transform_config.truncate,
                 False,
+                modalities=modality_batches,
             )
             # Build a simple row index -> embedding map
             embeddings_dict = dict(
--- a/nv_ingest_api/internal/transform/split_text.py
+++ b/nv_ingest_api/internal/transform/split_text.py
@@ -141,14 +141,19 @@ def transform_text_split_and_tokenize_internal(
 
     model_predownload_path = os.environ.get("MODEL_PREDOWNLOAD_PATH")
 
-    if os.path.exists(os.path.join(model_predownload_path, "llama-3.2-1b/tokenizer/tokenizer.json")) and (
-        tokenizer_identifier is None or tokenizer_identifier == "meta-llama/Llama-3.2-1B"
-    ):
-        tokenizer_identifier = os.path.join(model_predownload_path, "llama-3.2-1b/tokenizer/")
-    elif os.path.exists(os.path.join(model_predownload_path, "e5-large-unsupervised/tokenizer/tokenizer.json")) and (
-        tokenizer_identifier is None or tokenizer_identifier == "intfloat/e5-large-unsupervised"
-    ):
-        tokenizer_identifier = os.path.join(model_predownload_path, "e5-large-unsupervised/tokenizer/")
+    if model_predownload_path is not None:
+        if os.path.exists(os.path.join(model_predownload_path, "llama-3.2-1b/tokenizer/tokenizer.json")) and (
+            tokenizer_identifier is None or tokenizer_identifier == "meta-llama/Llama-3.2-1B"
+        ):
+            tokenizer_identifier = os.path.join(model_predownload_path, "llama-3.2-1b/tokenizer/")
+        elif os.path.exists(
+            os.path.join(model_predownload_path, "e5-large-unsupervised/tokenizer/tokenizer.json")
+        ) and (tokenizer_identifier is None or tokenizer_identifier == "intfloat/e5-large-unsupervised"):
+            tokenizer_identifier = os.path.join(model_predownload_path, "e5-large-unsupervised/tokenizer/")
+
+    # Defaulto to intfloat/e5-large-unsupervised if no tokenizer predownloaded or specified
+    if tokenizer_identifier is None:
+        tokenizer_identifier = "intfloat/e5-large-unsupervised"
 
     tokenizer_model = AutoTokenizer.from_pretrained(tokenizer_identifier, token=hf_access_token)
 
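The fix guards against an unset MODEL_PREDOWNLOAD_PATH (previously os.path.join(None, ...) would raise a TypeError) and adds an explicit default. A condensed sketch of the resulting resolution order, not the package function itself:

import os

def resolve_tokenizer(tokenizer_identifier=None):
    # Condensed sketch: a predownloaded tokenizer wins if present and matching;
    # otherwise fall back to the named identifier or the default model.
    predownload = os.environ.get("MODEL_PREDOWNLOAD_PATH")
    if predownload is not None:
        local = os.path.join(predownload, "llama-3.2-1b/tokenizer/")
        if os.path.exists(os.path.join(local, "tokenizer.json")) and tokenizer_identifier in (
            None,
            "meta-llama/Llama-3.2-1B",
        ):
            return local
    return tokenizer_identifier or "intfloat/e5-large-unsupervised"
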
--- a/nv_ingest_api/util/image_processing/table_and_chart.py
+++ b/nv_ingest_api/util/image_processing/table_and_chart.py
@@ -46,14 +46,14 @@ def process_yolox_graphic_elements(yolox_text_dict):
     return chart_content.strip()
 
 
-def match_bboxes(yolox_box, paddle_ocr_boxes, already_matched=None, delta=2.0):
+def match_bboxes(yolox_box, ocr_boxes, already_matched=None, delta=2.0):
     """
     Associates a yolox-graphic-elements box to PaddleOCR bboxes, by taking overlapping boxes.
     Criterion is iou > max_iou / delta where max_iou is the biggest found overlap.
     Boxes are expeceted in format (x0, y0, x1, y1)
     Args:
         yolox_box (np array [4]): Cached Bbox.
-        paddle_ocr_boxes (np array [n x 4]): PaddleOCR boxes
+        ocr_boxes (np array [n x 4]): PaddleOCR boxes
         already_matched (list or None, Optional): Already matched ids to ignore.
         delta (float, Optional): IoU delta for considering several boxes. Defaults to 2..
     Returns:
@@ -61,10 +61,10 @@ def match_bboxes(yolox_box, paddle_ocr_boxes, already_matched=None, delta=2.0):
     """
     x0_1, y0_1, x1_1, y1_1 = yolox_box
     x0_2, y0_2, x1_2, y1_2 = (
-        paddle_ocr_boxes[:, 0],
-        paddle_ocr_boxes[:, 1],
-        paddle_ocr_boxes[:, 2],
-        paddle_ocr_boxes[:, 3],
+        ocr_boxes[:, 0],
+        ocr_boxes[:, 1],
+        ocr_boxes[:, 2],
+        ocr_boxes[:, 3],
     )
 
     # Intersection
@@ -92,10 +92,10 @@ def match_bboxes(yolox_box, paddle_ocr_boxes, already_matched=None, delta=2.0):
     return matches
 
 
-def join_yolox_graphic_elements_and_paddle_output(yolox_output, paddle_boxes, paddle_txts):
+def join_yolox_graphic_elements_and_ocr_output(yolox_output, ocr_boxes, ocr_txts):
     """
     Matching boxes
-    We need to associate a text to the paddle detections.
+    We need to associate a text to the ocr detections.
     For each class and for each CACHED detections, we look for overlapping text bboxes
     with IoU > max_iou / delta where max_iou is the biggest found overlap.
     Found texts are added to the class representation, and removed from the texts to match
@@ -113,18 +113,18 @@ def join_yolox_graphic_elements_and_paddle_output(yolox_output, paddle_boxes, pa
         "value_label",
     ]
 
-    paddle_txts = np.array(paddle_txts)
-    paddle_boxes = np.array(paddle_boxes)
+    ocr_txts = np.array(ocr_txts)
+    ocr_boxes = np.array(ocr_boxes)
 
-    if (paddle_txts.size == 0) or (paddle_boxes.size == 0):
+    if (ocr_txts.size == 0) or (ocr_boxes.size == 0):
         return {}
 
-    paddle_boxes = np.array(
+    ocr_boxes = np.array(
         [
-            paddle_boxes[:, :, 0].min(-1),
-            paddle_boxes[:, :, 1].min(-1),
-            paddle_boxes[:, :, 0].max(-1),
-            paddle_boxes[:, :, 1].max(-1),
+            ocr_boxes[:, :, 0].min(-1),
+            ocr_boxes[:, :, 1].min(-1),
+            ocr_boxes[:, :, 0].max(-1),
+            ocr_boxes[:, :, 1].max(-1),
         ]
     ).T
 
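The OCR model returns each box as four (x, y) corner points (shape n x 4 x 2); the min/max reduction above collapses each quadrilateral to an axis-aligned (x0, y0, x1, y1) rectangle before IoU matching:

import numpy as np

quads = np.array([[[10, 5], [40, 5], [40, 20], [10, 20]]])  # one 4-corner box
rects = np.array(
    [
        quads[:, :, 0].min(-1),  # x0
        quads[:, :, 1].min(-1),  # y0
        quads[:, :, 0].max(-1),  # x1
        quads[:, :, 1].max(-1),  # y1
    ]
).T
print(rects)  # [[10  5 40 20]]
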
@@ -139,10 +139,10 @@
         for yolox_box in yolox_output[k]:
             # if there's a score at the end, drop the score.
             yolox_box = yolox_box[:4]
-            paddle_ids = match_bboxes(yolox_box, paddle_boxes, already_matched=already_matched, delta=4)
+            ocr_ids = match_bboxes(yolox_box, ocr_boxes, already_matched=already_matched, delta=4)
 
-            if len(paddle_ids) > 0:
-                text = " ".join(paddle_txts[paddle_ids].tolist())
+            if len(ocr_ids) > 0:
+                text = " ".join(ocr_txts[ocr_ids].tolist())
                 texts.append(text)
 
         processed_texts = []
@@ -161,7 +161,7 @@ def join_yolox_graphic_elements_and_paddle_output(yolox_output, paddle_boxes, pa
     return results
 
 
-def convert_paddle_response_to_psuedo_markdown(bboxes, texts):
+def convert_ocr_response_to_psuedo_markdown(bboxes, texts):
     if (not bboxes) or (not texts):
         return ""
 
@@ -186,22 +186,22 @@ def convert_paddle_response_to_psuedo_markdown(bboxes, texts):
     return results
 
 
-def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_boxes, paddle_ocr_txts):
-    if (not paddle_ocr_boxes) or (not paddle_ocr_txts):
+def join_yolox_table_structure_and_ocr_output(yolox_cell_preds, ocr_boxes, ocr_txts):
+    if (not ocr_boxes) or (not ocr_txts):
         return ""
 
-    paddle_ocr_boxes = np.array(paddle_ocr_boxes)
-    paddle_ocr_boxes_ = np.array(
+    ocr_boxes = np.array(ocr_boxes)
+    ocr_boxes_ = np.array(
         [
-            paddle_ocr_boxes[:, :, 0].min(-1),
-            paddle_ocr_boxes[:, :, 1].min(-1),
-            paddle_ocr_boxes[:, :, 0].max(-1),
-            paddle_ocr_boxes[:, :, 1].max(-1),
+            ocr_boxes[:, :, 0].min(-1),
+            ocr_boxes[:, :, 1].min(-1),
+            ocr_boxes[:, :, 0].max(-1),
+            ocr_boxes[:, :, 1].max(-1),
         ]
     ).T
 
     assignments = []
-    for i, (b, t) in enumerate(zip(paddle_ocr_boxes_, paddle_ocr_txts)):
+    for i, (b, t) in enumerate(zip(ocr_boxes_, ocr_txts)):
         # Find a cell
         matches_cell = assign_boxes(b, yolox_cell_preds["cell"], delta=1)
         cell = yolox_cell_preds["cell"][matches_cell[0]] if len(matches_cell) else b
@@ -221,7 +221,7 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
         assignments.append(
             {
                 "index": i,
-                "paddle_box": b,
+                "ocr_box": b,
                 "is_table": isinstance(col_ids, np.ndarray) and isinstance(row_ids, np.ndarray),
                 "cell_id": matches_cell[0] if len(matches_cell) else -1,
                 "cell": cell,
@@ -249,13 +249,13 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
     mat = build_markdown(df_table)
     markdown_table = display_markdown(mat, use_header=False)
 
-    all_boxes = np.stack(df_table.paddle_box.values)
+    all_boxes = np.stack(df_table.ocr_box.values)
     table_box = np.concatenate([all_boxes[:, [0, 1]].min(0), all_boxes[:, [2, 3]].max(0)])
 
     df_table_to_text = pd.DataFrame(
         [
             {
-                "paddle_box": table_box,
+                "ocr_box": table_box,
                 "text": markdown_table,
                 "is_table": True,
             }
@@ -264,7 +264,7 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
     # Final text representations dataframe
     df_text = pd.concat([df_text, df_table_to_text], ignore_index=True)
 
-    df_text = df_text.rename(columns={"paddle_box": "box"})
+    df_text = df_text.rename(columns={"ocr_box": "box"})
 
     # Sort by y and x
     df_text["x"] = df_text["box"].apply(lambda x: (x[0] + x[2]) / 2)
@@ -297,12 +297,12 @@ def join_yolox_table_structure_and_paddle_output(yolox_cell_preds, paddle_ocr_bo
     return result
 
 
-def assign_boxes(paddle_box, boxes, delta=2.0, min_overlap=0.25):
+def assign_boxes(ocr_box, boxes, delta=2.0, min_overlap=0.25):
     """
-    Assigns the closest bounding boxes to a reference `paddle_box` based on overlap.
+    Assigns the closest bounding boxes to a reference `ocr_box` based on overlap.
 
     Args:
-        paddle_box (list or numpy.ndarray): Reference bounding box [x_min, y_min, x_max, y_max].
+        ocr_box (list or numpy.ndarray): Reference bounding box [x_min, y_min, x_max, y_max].
         boxes (numpy.ndarray): Array of candidate bounding boxes with shape (N, 4).
         delta (float, optional): Factor for matches relative to the best overlap. Defaults to 2.0.
         min_overlap (float, optional): Minimum required overlap for a match. Defaults to 0.25.
@@ -316,7 +316,7 @@ def assign_boxes(paddle_box, boxes, delta=2.0, min_overlap=0.25):
 
     boxes = np.array(boxes)
 
-    x0_1, y0_1, x1_1, y1_1 = paddle_box
+    x0_1, y0_1, x1_1, y1_1 = ocr_box
     x0_2, y0_2, x1_2, y1_2 = (
         boxes[:, 0],
         boxes[:, 1],
@@ -331,7 +331,7 @@ def assign_boxes(paddle_box, boxes, delta=2.0, min_overlap=0.25):
     inter_x1 = np.minimum(x1_1, x1_2)
     inter_area = np.maximum(0, inter_y1 - inter_y0) * np.maximum(0, inter_x1 - inter_x0)
 
-    # Normalize by paddle_box size
+    # Normalize by ocr_box size
     area_1 = (y1_1 - y0_1) * (x1_1 - x0_1)
     ious = inter_area / (area_1 + 1e-6)
 
@@ -385,16 +385,16 @@ def merge_text_in_cell(df_cell):
     Returns:
         pandas.DataFrame: Updated DataFrame with merged text and a single bounding box.
     """
-    paddle_boxes = np.stack(df_cell["paddle_box"].values)
+    ocr_boxes = np.stack(df_cell["ocr_box"].values)
 
-    df_cell["x"] = (paddle_boxes[:, 0] - paddle_boxes[:, 0].min()) // 10
-    df_cell["y"] = (paddle_boxes[:, 1] - paddle_boxes[:, 1].min()) // 10
+    df_cell["x"] = (ocr_boxes[:, 0] - ocr_boxes[:, 0].min()) // 10
+    df_cell["y"] = (ocr_boxes[:, 1] - ocr_boxes[:, 1].min()) // 10
     df_cell = df_cell.sort_values(["y", "x"])
 
     text = " ".join(df_cell["text"].values.tolist())
     df_cell["text"] = text
     df_cell = df_cell.head(1)
-    df_cell["paddle_box"] = df_cell["cell"]
+    df_cell["ocr_box"] = df_cell["cell"]
     df_cell.drop(["x", "y"], axis=1, inplace=True)
 
     return df_cell
@@ -447,3 +447,58 @@ def display_markdown(
     markdown_table = "\n".join("| " + " | ".join(row) + " |" for row in data)
 
     return markdown_table
+
+
+def reorder_boxes(boxes, texts, confs, mode="top_left", dbscan_eps=10):
+    """
+    Reorders the boxes in reading order.
+    If mode is "center", the boxes are reordered using bbox center.
+    If mode is "top_left", the boxes are reordered using the top left corner.
+    If dbscan_eps is not 0, the boxes are reordered using DBSCAN clustering.
+
+    Args:
+        boxes (np array [n x 4 x 2]): The bounding boxes of the OCR results.
+        texts (np array [n]): The text of the OCR results.
+        confs (np array [n]): The confidence scores of the OCR results.
+        mode (str, optional): The mode to reorder the boxes. Defaults to "center".
+        dbscan_eps (float, optional): The epsilon parameter for DBSCAN. Defaults to 10.
+
+    Returns:
+        List[List[int, ...]]: The reordered bounding boxes.
+        List[str]: The reordered texts.
+        List[float]: The reordered confidence scores.
+    """
+    df = pd.DataFrame(
+        [[b, t, c] for b, t, c in zip(boxes, texts, confs)],
+        columns=["bbox", "text", "conf"],
+    )
+
+    if mode == "center":
+        df["x"] = df["bbox"].apply(lambda box: (box[0][0] + box[2][0]) / 2)
+        df["y"] = df["bbox"].apply(lambda box: (box[0][1] + box[2][1]) / 2)
+    elif mode == "top_left":
+        df["x"] = df["bbox"].apply(lambda box: (box[0][0]))
+        df["y"] = df["bbox"].apply(lambda box: (box[0][1]))
+
+    if dbscan_eps:
+        do_naive_sorting = False
+        try:
+            dbscan = DBSCAN(eps=dbscan_eps, min_samples=1)
+            dbscan.fit(df["y"].values[:, None])
+            df["cluster"] = dbscan.labels_
+            df["cluster_centers"] = df.groupby("cluster")["y"].transform("mean").astype(int)
+            df = df.sort_values(["cluster_centers", "x"], ascending=[True, True], ignore_index=True)
+        except ValueError:
+            do_naive_sorting = True
+    else:
+        do_naive_sorting = True
+
+    if do_naive_sorting:
+        df["y"] = np.round((df["y"] - df["y"].min()) // 5, 0)
+        df = df.sort_values(["y", "x"], ascending=[True, True], ignore_index=True)
+
+    bboxes = df["bbox"].values.tolist()
+    texts = df["text"].values.tolist()
+    confs = df["conf"].values.tolist()
+
+    return bboxes, texts, confs
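
Assuming DBSCAN is imported from scikit-learn in this module (the new code depends on it), a small usage sketch: clustering on y groups boxes into visual lines, and sorting by (line, x) yields reading order:

# Toy usage sketch; boxes are 4-corner quads as elsewhere in this module.
boxes = [
    [[200, 12], [260, 12], [260, 28], [200, 28]],  # first line, right
    [[10, 10], [80, 10], [80, 26], [10, 26]],      # first line, left
    [[12, 60], [90, 60], [90, 76], [12, 76]],      # second line
]
texts = ["world", "hello", "again"]
confs = [0.98, 0.99, 0.97]

bboxes, texts, confs = reorder_boxes(boxes, texts, confs)
print(texts)  # ['hello', 'world', 'again']
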
--- a/nv_ingest_api/util/image_processing/transforms.py
+++ b/nv_ingest_api/util/image_processing/transforms.py
@@ -20,6 +20,9 @@ cv2.setNumThreads(1)
 DEFAULT_MAX_WIDTH = 1024
 DEFAULT_MAX_HEIGHT = 1280
 
+# Workaround for PIL.Image.DecompressionBombError
+Image.MAX_IMAGE_PIXELS = None
+
 logger = logging.getLogger(__name__)
 
 
@@ -206,6 +209,7 @@ def pad_image(
     target_height: int = DEFAULT_MAX_HEIGHT,
     background_color: int = 255,
     dtype=np.uint8,
+    how: str = "center",
 ) -> Tuple[np.ndarray, Tuple[int, int]]:
     """
     Pads a NumPy array representing an image to the specified target dimensions.
@@ -214,6 +218,8 @@
     in that dimension. If the target dimensions are larger, the image will be centered within the
     canvas of the specified target size, with the remaining space filled with white padding.
 
+    The padding can be done around the center (how="center"), or to the bottom right (how="bottom_right").
+
     Parameters
     ----------
     array : np.ndarray
@@ -222,6 +228,8 @@
         The desired target width of the padded image. Defaults to DEFAULT_MAX_WIDTH.
     target_height : int, optional
         The desired target height of the padded image. Defaults to DEFAULT_MAX_HEIGHT.
+    how : str, optional
+        The method to pad the image. Defaults to "center".
 
     Returns
     -------
@@ -246,17 +254,23 @@
     """
     height, width = array.shape[:2]
 
-    # Determine the padding needed, if any, while ensuring no padding is applied if the target is smaller
-    pad_height = max((target_height - height) // 2, 0)
-    pad_width = max((target_width - width) // 2, 0)
-
     # Determine final canvas size (may be equal to original if target is smaller)
     final_height = max(height, target_height)
     final_width = max(width, target_width)
 
     # Create the canvas and place the original image on it
     canvas = background_color * np.ones((final_height, final_width, array.shape[2]), dtype=dtype)
-    canvas[pad_height : pad_height + height, pad_width : pad_width + width] = array  # noqa: E203
+
+    # Determine the padding needed, if any, while ensuring no padding is applied if the target is smaller
+    if how == "center":
+        pad_height = max((target_height - height) // 2, 0)
+        pad_width = max((target_width - width) // 2, 0)
+
+        canvas[pad_height : pad_height + height, pad_width : pad_width + width] = array  # noqa: E203
+    elif how == "bottom_right":
+        pad_height, pad_width = 0, 0
+
+        canvas[:height, :width] = array  # noqa: E203
 
     return canvas, (pad_width, pad_height)
 
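With how="bottom_right" the image is anchored at the top-left corner and all padding goes to the bottom and right, so the returned offsets are always (0, 0), which simplifies mapping detection coordinates back to the original image. A quick comparison:

import numpy as np

img = np.zeros((2, 2, 3), dtype=np.uint8)

_, offsets = pad_image(img, target_width=4, target_height=4, how="center")
print(offsets)  # (1, 1): image centered, padding split on both sides

_, offsets = pad_image(img, target_width=4, target_height=4, how="bottom_right")
print(offsets)  # (0, 0): image at top-left, padding pushed bottom-right
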
--- a/nv_ingest_api/util/message_brokers/simple_message_broker/broker.py
+++ b/nv_ingest_api/util/message_brokers/simple_message_broker/broker.py
@@ -250,7 +250,7 @@ class SimpleMessageBrokerHandler(socketserver.BaseRequestHandler):
             with queue_lock:
                 if queue.empty():
                     # Return failure response immediately
-                    response = ResponseSchema(response_code=1, response_reason="Queue is empty")
+                    response = ResponseSchema(response_code=2, response_reason="Job not ready")
                     self._send_response(response)
                     return
                 # Pop the message from the queue
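
With this change an empty queue is reported as response_code 2 ("Job not ready") instead of the generic failure code 1, so clients can poll rather than treat it as an error (the matching client-side changes are in simple_client.py). A hedged sketch of how a caller might react; the fetch callable, field names, and retry policy here are illustrative assumptions:

import json
import time

def pop_with_retry(fetch_response, attempts=5, delay=0.5):
    # fetch_response() is a placeholder that returns the broker's raw JSON reply.
    for _ in range(attempts):
        reply = json.loads(fetch_response())
        code = reply.get("response_code")
        if code == 0:
            return reply          # success: payload accompanies the reply
        if code == 2:             # "Job not ready": queue empty, poll again
            time.sleep(delay)
            continue
        raise RuntimeError(reply.get("response_reason", "broker failure"))
    raise TimeoutError("job not ready after retries")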