docling-ibm-models 3.4.0__tar.gz → 3.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/PKG-INFO +1 -1
  2. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/reading_order/reading_order_rb.py +2 -2
  3. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/tableformer/data_management/matching_post_processor.py +61 -96
  4. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/pyproject.toml +1 -1
  5. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/LICENSE +0 -0
  6. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/README.md +0 -0
  7. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/__init__.py +0 -0
  8. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/code_formula_model/__init__.py +0 -0
  9. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/code_formula_model/code_formula_predictor.py +0 -0
  10. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/code_formula_model/models/__init__.py +0 -0
  11. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/code_formula_model/models/sam.py +0 -0
  12. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/code_formula_model/models/sam_opt.py +0 -0
  13. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/code_formula_model/models/sam_opt_image_processor.py +0 -0
  14. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/document_figure_classifier_model/__init__.py +0 -0
  15. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/document_figure_classifier_model/document_figure_classifier_predictor.py +0 -0
  16. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/layoutmodel/__init__.py +0 -0
  17. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/layoutmodel/layout_predictor.py +0 -0
  18. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/py.typed +0 -0
  19. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/reading_order/__init__.py +0 -0
  20. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/tableformer/__init__.py +0 -0
  21. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/tableformer/common.py +0 -0
  22. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/tableformer/data_management/__init__.py +0 -0
  23. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/tableformer/data_management/functional.py +0 -0
  24. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/tableformer/data_management/tf_cell_matcher.py +0 -0
  25. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/tableformer/data_management/tf_predictor.py +0 -0
  26. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/tableformer/data_management/transforms.py +0 -0
  27. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/tableformer/models/__init__.py +0 -0
  28. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/tableformer/models/common/__init__.py +0 -0
  29. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/tableformer/models/common/base_model.py +0 -0
  30. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/tableformer/models/table04_rs/__init__.py +0 -0
  31. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/tableformer/models/table04_rs/bbox_decoder_rs.py +0 -0
  32. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/tableformer/models/table04_rs/encoder04_rs.py +0 -0
  33. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/tableformer/models/table04_rs/tablemodel04_rs.py +0 -0
  34. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/tableformer/models/table04_rs/transformer_rs.py +0 -0
  35. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/tableformer/otsl.py +0 -0
  36. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/tableformer/settings.py +0 -0
  37. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/tableformer/utils/__init__.py +0 -0
  38. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/tableformer/utils/app_profiler.py +0 -0
  39. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/tableformer/utils/mem_monitor.py +0 -0
  40. {docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/tableformer/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-ibm-models
3
- Version: 3.4.0
3
+ Version: 3.4.2
4
4
  Summary: This package contains the AI models used by the Docling PDF conversion package
5
5
  License: MIT
6
6
  Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former
@@ -201,8 +201,8 @@ class ReadingOrderPredictor:
201
201
  )
202
202
  ):
203
203
 
204
- m1 = re.fullmatch(".+([a-z\,\-])(\s*)", elem.text)
205
- m2 = re.fullmatch("(\s*[a-z])(.+)", sorted_elements[ind_p1].text)
204
+ m1 = re.fullmatch(r".+([a-z,\-])(\s*)", elem.text)
205
+ m2 = re.fullmatch(r"(\s*[a-z])(.+)", sorted_elements[ind_p1].text)
206
206
 
207
207
  if m1 and m2:
208
208
  merges[elem.cid] = [sorted_elements[ind_p1].cid]
@@ -468,112 +468,77 @@ class MatchingPostProcessor:
468
468
  return table_cells
469
469
 
470
470
  def _align_table_cells_to_pdf(self, table_cells, pdf_cells, matches):
471
- r"""
472
- USED in 8.a step
473
- NOT USED in 6. step
474
-
475
- Align table cell bboxes with good matches
476
- to encapsulate matching pdf cells
477
-
478
- Parameters
479
- ----------
480
- table_cells : list of dict
481
- Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
482
- pdf_cells : list of dict
483
- List of PDF cells as defined by Docling
484
- matches : dictionary of lists of table_cells
485
- A dictionary which is indexed by the pdf_cell_id as key and the value is a list
486
- of the table_cells that fall inside that pdf cell
487
-
488
- Returns
489
- -------
490
- clean_table_cells : list of dict
491
- Aligned and cleaned table cells
492
471
  """
493
- # 6
494
- # align table cells with matching pdf cells
495
- new_table_cells = []
496
-
497
- for pdf_cell_id in matches:
498
- match_list = matches[pdf_cell_id]
499
- one_table_cells = []
500
- for i in range(len(match_list)):
501
- otc = int(match_list[i]["table_cell_id"])
502
- if otc not in one_table_cells:
503
- one_table_cells.append(otc)
504
-
505
- # Get bbox of pdf_cell:
506
- pdf_cell_bbox = []
507
- for pdf_cell in pdf_cells:
508
- if pdf_cell["id"] == int(pdf_cell_id):
509
- pdf_cell_bbox = pdf_cell["bbox"]
510
-
511
- # Get bbox of pdf_cell:
512
- for table_cell in table_cells:
513
- if table_cell["cell_id"] in one_table_cells:
514
- # Align bbox vertically to cover PDF cell
515
- new_bbox = [
516
- pdf_cell_bbox[0],
517
- pdf_cell_bbox[1],
518
- pdf_cell_bbox[2],
519
- pdf_cell_bbox[3],
520
- ]
521
- # We are sure cell is not empty,
522
- # because we assign PDF cell to it
523
- new_table_cell_class = "2"
524
-
525
- if "cell_class" in table_cell:
526
- new_table_cell_class = table_cell["cell_class"]
527
-
528
- new_table_cell = {
529
- "bbox": new_bbox,
530
- "cell_id": table_cell["cell_id"],
531
- "column_id": table_cell["column_id"],
532
- "label": table_cell["label"],
533
- "row_id": table_cell["row_id"],
534
- "cell_class": new_table_cell_class,
535
- }
472
+ Align table cell bboxes with good matches to encapsulate matching pdf cells
473
+ """
474
+ pdf_cell_dict = {pdf_cell["id"]: pdf_cell["bbox"] for pdf_cell in pdf_cells}
475
+ table_cell_dict = {cell["cell_id"]: cell for cell in table_cells}
536
476
 
537
- if "colspan_val" in table_cell:
538
- new_table_cell["colspan_val"] = table_cell["colspan_val"]
539
- if "rowspan_val" in table_cell:
540
- new_table_cell["rowspan_val"] = table_cell["rowspan_val"]
541
- new_table_cells.append(new_table_cell)
477
+ # Track unique cells we're going to add
478
+ processed_cells = set()
542
479
 
543
- # Rebuild table_cells list deduplicating repeating cells,
544
- # encapsulating all duplicate cells dimensions
480
+ # First pass - create initial new_table_cells with aligned bboxes
481
+ new_table_cells = []
545
482
 
546
- for new_table_cell in new_table_cells:
547
- cell_id_to_find = new_table_cell["cell_id"]
483
+ for pdf_cell_id, match_list in matches.items():
484
+ # Extract unique table cell ids from match_list
485
+ table_cell_ids = set(int(match["table_cell_id"]) for match in match_list)
548
486
 
549
- x1s = []
550
- y1s = []
551
- x2s = []
552
- y2s = []
487
+ # Get bbox of pdf_cell
488
+ pdf_cell_bbox = pdf_cell_dict.get(int(pdf_cell_id))
489
+ if not pdf_cell_bbox:
490
+ continue
553
491
 
554
- found = 0
492
+ # Process each unique table cell
493
+ for cell_id in table_cell_ids:
494
+ if cell_id in processed_cells:
495
+ continue
496
+
497
+ table_cell = table_cell_dict.get(cell_id)
498
+ if not table_cell:
499
+ continue
500
+
501
+ # Create new table cell with aligned bbox
502
+ new_table_cell = table_cell.copy()
503
+ new_table_cell["bbox"] = list(pdf_cell_bbox)
504
+
505
+ # Set cell class
506
+ if "cell_class" not in new_table_cell:
507
+ new_table_cell["cell_class"] = "2"
508
+
509
+ new_table_cells.append(new_table_cell)
510
+ processed_cells.add(cell_id)
511
+
512
+ # Second pass - aggregate bboxes for duplicate cells
513
+ cell_to_bboxes = {}
514
+ for cell in new_table_cells:
515
+ cell_id = cell["cell_id"]
516
+ if cell_id not in cell_to_bboxes:
517
+ cell_to_bboxes[cell_id] = []
518
+ cell_to_bboxes[cell_id].append(cell["bbox"])
519
+
520
+ # Create final clean table cells
521
+ clean_table_cells = []
522
+ processed_ids = set()
523
+
524
+ for cell in new_table_cells:
525
+ cell_id = cell["cell_id"]
526
+ if cell_id in processed_ids:
527
+ continue
555
528
 
556
- for found_cell in new_table_cells:
557
- if found_cell["cell_id"] == cell_id_to_find:
558
- found += 1
559
- x1s.append(found_cell["bbox"][0])
560
- y1s.append(found_cell["bbox"][1])
561
- x2s.append(found_cell["bbox"][2])
562
- y2s.append(found_cell["bbox"][3])
529
+ bboxes = cell_to_bboxes[cell_id]
530
+ if len(bboxes) > 1:
531
+ # Merge bboxes
532
+ x1s = [bbox[0] for bbox in bboxes]
533
+ y1s = [bbox[1] for bbox in bboxes]
534
+ x2s = [bbox[2] for bbox in bboxes]
535
+ y2s = [bbox[3] for bbox in bboxes]
563
536
 
564
- min_x1 = min(x1s)
565
- min_y1 = min(y1s)
566
- max_x2 = max(x2s)
567
- max_y2 = max(y2s)
537
+ cell["bbox"] = [min(x1s), min(y1s), max(x2s), max(y2s)]
568
538
 
569
- if found > 1:
570
- new_table_cell["bbox"] = [min_x1, min_y1, max_x2, max_y2]
539
+ clean_table_cells.append(cell)
540
+ processed_ids.add(cell_id)
571
541
 
572
- clean_table_cells = [
573
- i
574
- for n, i in enumerate(new_table_cells)
575
- if i not in new_table_cells[n + 1 :]
576
- ]
577
542
  return clean_table_cells
578
543
 
579
544
  def _deduplicate_cells(self, tab_columns, table_cells, iou_matches, ioc_matches):
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-ibm-models"
3
- version = "3.4.0" # DO NOT EDIT, updated automatically
3
+ version = "3.4.2" # DO NOT EDIT, updated automatically
4
4
  description = "This package contains the AI models used by the Docling PDF conversion package"
5
5
  authors = ["Nikos Livathinos <nli@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"