docling-ibm-models 3.4.1__py3-none-any.whl → 3.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -468,112 +468,77 @@ class MatchingPostProcessor:
468
468
  return table_cells
469
469
 
470
470
  def _align_table_cells_to_pdf(self, table_cells, pdf_cells, matches):
471
- r"""
472
- USED in 8.a step
473
- NOT USED in 6. step
474
-
475
- Align table cell bboxes with good matches
476
- to encapsulate matching pdf cells
477
-
478
- Parameters
479
- ----------
480
- table_cells : list of dict
481
- Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
482
- pdf_cells : list of dict
483
- List of PDF cells as defined by Docling
484
- matches : dictionary of lists of table_cells
485
- A dictionary which is indexed by the pdf_cell_id as key and the value is a list
486
- of the table_cells that fall inside that pdf cell
487
-
488
- Returns
489
- -------
490
- clean_table_cells : list of dict
491
- Aligned and cleaned table cells
492
471
  """
493
- # 6
494
- # align table cells with matching pdf cells
495
- new_table_cells = []
496
-
497
- for pdf_cell_id in matches:
498
- match_list = matches[pdf_cell_id]
499
- one_table_cells = []
500
- for i in range(len(match_list)):
501
- otc = int(match_list[i]["table_cell_id"])
502
- if otc not in one_table_cells:
503
- one_table_cells.append(otc)
504
-
505
- # Get bbox of pdf_cell:
506
- pdf_cell_bbox = []
507
- for pdf_cell in pdf_cells:
508
- if pdf_cell["id"] == int(pdf_cell_id):
509
- pdf_cell_bbox = pdf_cell["bbox"]
510
-
511
- # Get bbox of pdf_cell:
512
- for table_cell in table_cells:
513
- if table_cell["cell_id"] in one_table_cells:
514
- # Align bbox vertically to cover PDF cell
515
- new_bbox = [
516
- pdf_cell_bbox[0],
517
- pdf_cell_bbox[1],
518
- pdf_cell_bbox[2],
519
- pdf_cell_bbox[3],
520
- ]
521
- # We are sure cell is not empty,
522
- # because we assign PDF cell to it
523
- new_table_cell_class = "2"
524
-
525
- if "cell_class" in table_cell:
526
- new_table_cell_class = table_cell["cell_class"]
527
-
528
- new_table_cell = {
529
- "bbox": new_bbox,
530
- "cell_id": table_cell["cell_id"],
531
- "column_id": table_cell["column_id"],
532
- "label": table_cell["label"],
533
- "row_id": table_cell["row_id"],
534
- "cell_class": new_table_cell_class,
535
- }
472
+ Align table cell bboxes with good matches to encapsulate matching pdf cells
473
+ """
474
+ pdf_cell_dict = {pdf_cell["id"]: pdf_cell["bbox"] for pdf_cell in pdf_cells}
475
+ table_cell_dict = {cell["cell_id"]: cell for cell in table_cells}
536
476
 
537
- if "colspan_val" in table_cell:
538
- new_table_cell["colspan_val"] = table_cell["colspan_val"]
539
- if "rowspan_val" in table_cell:
540
- new_table_cell["rowspan_val"] = table_cell["rowspan_val"]
541
- new_table_cells.append(new_table_cell)
477
+ # Track unique cells we're going to add
478
+ processed_cells = set()
542
479
 
543
- # Rebuild table_cells list deduplicating repeating cells,
544
- # encapsulating all duplicate cells dimensions
480
+ # First pass - create initial new_table_cells with aligned bboxes
481
+ new_table_cells = []
545
482
 
546
- for new_table_cell in new_table_cells:
547
- cell_id_to_find = new_table_cell["cell_id"]
483
+ for pdf_cell_id, match_list in matches.items():
484
+ # Extract unique table cell ids from match_list
485
+ table_cell_ids = set(int(match["table_cell_id"]) for match in match_list)
548
486
 
549
- x1s = []
550
- y1s = []
551
- x2s = []
552
- y2s = []
487
+ # Get bbox of pdf_cell
488
+ pdf_cell_bbox = pdf_cell_dict.get(int(pdf_cell_id))
489
+ if not pdf_cell_bbox:
490
+ continue
553
491
 
554
- found = 0
492
+ # Process each unique table cell
493
+ for cell_id in table_cell_ids:
494
+ if cell_id in processed_cells:
495
+ continue
496
+
497
+ table_cell = table_cell_dict.get(cell_id)
498
+ if not table_cell:
499
+ continue
500
+
501
+ # Create new table cell with aligned bbox
502
+ new_table_cell = table_cell.copy()
503
+ new_table_cell["bbox"] = list(pdf_cell_bbox)
504
+
505
+ # Set cell class
506
+ if "cell_class" not in new_table_cell:
507
+ new_table_cell["cell_class"] = "2"
508
+
509
+ new_table_cells.append(new_table_cell)
510
+ processed_cells.add(cell_id)
511
+
512
+ # Second pass - aggregate bboxes for duplicate cells
513
+ cell_to_bboxes = {}
514
+ for cell in new_table_cells:
515
+ cell_id = cell["cell_id"]
516
+ if cell_id not in cell_to_bboxes:
517
+ cell_to_bboxes[cell_id] = []
518
+ cell_to_bboxes[cell_id].append(cell["bbox"])
519
+
520
+ # Create final clean table cells
521
+ clean_table_cells = []
522
+ processed_ids = set()
523
+
524
+ for cell in new_table_cells:
525
+ cell_id = cell["cell_id"]
526
+ if cell_id in processed_ids:
527
+ continue
555
528
 
556
- for found_cell in new_table_cells:
557
- if found_cell["cell_id"] == cell_id_to_find:
558
- found += 1
559
- x1s.append(found_cell["bbox"][0])
560
- y1s.append(found_cell["bbox"][1])
561
- x2s.append(found_cell["bbox"][2])
562
- y2s.append(found_cell["bbox"][3])
529
+ bboxes = cell_to_bboxes[cell_id]
530
+ if len(bboxes) > 1:
531
+ # Merge bboxes
532
+ x1s = [bbox[0] for bbox in bboxes]
533
+ y1s = [bbox[1] for bbox in bboxes]
534
+ x2s = [bbox[2] for bbox in bboxes]
535
+ y2s = [bbox[3] for bbox in bboxes]
563
536
 
564
- min_x1 = min(x1s)
565
- min_y1 = min(y1s)
566
- max_x2 = max(x2s)
567
- max_y2 = max(y2s)
537
+ cell["bbox"] = [min(x1s), min(y1s), max(x2s), max(y2s)]
568
538
 
569
- if found > 1:
570
- new_table_cell["bbox"] = [min_x1, min_y1, max_x2, max_y2]
539
+ clean_table_cells.append(cell)
540
+ processed_ids.add(cell_id)
571
541
 
572
- clean_table_cells = [
573
- i
574
- for n, i in enumerate(new_table_cells)
575
- if i not in new_table_cells[n + 1 :]
576
- ]
577
542
  return clean_table_cells
578
543
 
579
544
  def _deduplicate_cells(self, tab_columns, table_cells, iou_matches, ioc_matches):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-ibm-models
3
- Version: 3.4.1
3
+ Version: 3.4.2
4
4
  Summary: This package contains the AI models used by the Docling PDF conversion package
5
5
  License: MIT
6
6
  Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former
@@ -16,7 +16,7 @@ docling_ibm_models/tableformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
16
16
  docling_ibm_models/tableformer/common.py,sha256=2zgGZBFf4fXytEaXrZR2NU6FWdX2kxO0DHlGZmuvpNQ,3230
17
17
  docling_ibm_models/tableformer/data_management/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
18
  docling_ibm_models/tableformer/data_management/functional.py,sha256=kJntHEXFz2SP7obEcHyjAqZNZC9qh-U75MwUJALLADI,3143
19
- docling_ibm_models/tableformer/data_management/matching_post_processor.py,sha256=meSM0jLWNLS8P95QjN6pEp095jFEbKdl9KKfRY1ocy0,58046
19
+ docling_ibm_models/tableformer/data_management/matching_post_processor.py,sha256=_MVbsm0l5aKP3ChvKhXFeZ2Gz_DHGLlyMbqbKTan_MU,56721
20
20
  docling_ibm_models/tableformer/data_management/tf_cell_matcher.py,sha256=IdZTaWIRhPpyEwzZgCmviZnYacR6kbcUqBvx7ilmkKY,21250
21
21
  docling_ibm_models/tableformer/data_management/tf_predictor.py,sha256=BHd6KdAX0-b9TbX01m0872MO10zWDMValyf4UTIRkAU,39008
22
22
  docling_ibm_models/tableformer/data_management/transforms.py,sha256=NNaz_7GI7FCVmu_rJuenqH5VfzRSljJHUHpNQQ8Mq3Q,2983
@@ -34,7 +34,7 @@ docling_ibm_models/tableformer/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeu
34
34
  docling_ibm_models/tableformer/utils/app_profiler.py,sha256=Pb7o1zcikKXh7ninaNt4_nVa1xuUrogZxbTr6U6jkEE,8392
35
35
  docling_ibm_models/tableformer/utils/mem_monitor.py,sha256=NFZUnrfLThXNZQrm3ESRmPSJmPF2J1z3E2v_72O4dRw,6408
36
36
  docling_ibm_models/tableformer/utils/utils.py,sha256=8Bxf1rEn977lFbY9NX0r5xh9PvxIRipQZX_EZW92XfA,10980
37
- docling_ibm_models-3.4.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
38
- docling_ibm_models-3.4.1.dist-info/METADATA,sha256=b-QvjR6ePrwKDUoE-ZR9JSyfQsiGMuKuQUtBt1YTHXc,7434
39
- docling_ibm_models-3.4.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
40
- docling_ibm_models-3.4.1.dist-info/RECORD,,
37
+ docling_ibm_models-3.4.2.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
38
+ docling_ibm_models-3.4.2.dist-info/METADATA,sha256=AC30CNriUSKcviE24Sn1eIEkrwvXzoM5jiP7ImYR4VU,7434
39
+ docling_ibm_models-3.4.2.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
40
+ docling_ibm_models-3.4.2.dist-info/RECORD,,