PyPI - docling-ibm-models - Versions diffs - 3.4.0__tar.gz → 3.4.2__tar.gz - Mend

docling-ibm-models 3.4.0 (tar.gz) → 3.4.2 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40)

{docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling-ibm-models
-Version: 3.4.0
+Version: 3.4.2
 Summary: This package contains the AI models used by the Docling PDF conversion package
 License: MIT
 Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former

{docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/reading_order/reading_order_rb.py RENAMED Viewed

@@ -201,8 +201,8 @@ class ReadingOrderPredictor:
                     )
                 ):
-                    m1 = re.fullmatch(".+([a-z\,\-])(\s*)", elem.text)
-                    m2 = re.fullmatch("(\s*[a-z])(.+)", sorted_elements[ind_p1].text)
+                    m1 = re.fullmatch(r".+([a-z,\-])(\s*)", elem.text)
+                    m2 = re.fullmatch(r"(\s*[a-z])(.+)", sorted_elements[ind_p1].text)
                     if m1 and m2:
                         merges[elem.cid] = [sorted_elements[ind_p1].cid]

{docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/docling_ibm_models/tableformer/data_management/matching_post_processor.py RENAMED Viewed

@@ -468,112 +468,77 @@ class MatchingPostProcessor:
         return table_cells
     def _align_table_cells_to_pdf(self, table_cells, pdf_cells, matches):
-        r"""
-        USED in 8.a step
-        NOT USED in 6. step
-        Align table cell bboxes with good matches
-        to encapsulate matching pdf cells
-        Parameters
-        ----------
-        table_cells : list of dict
-            Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
-        pdf_cells : list of dict
-            List of PDF cells as defined by Docling
-        matches : dictionary of lists of table_cells
-            A dictionary which is indexed by the pdf_cell_id as key and the value is a list
-            of the table_cells that fall inside that pdf cell
-        Returns
-        -------
-        clean_table_cells : list of dict
-            Aligned and cleaned table cells
         """
-        # 6
-        # align table cells with matching pdf cells
-        new_table_cells = []
-        for pdf_cell_id in matches:
-            match_list = matches[pdf_cell_id]
-            one_table_cells = []
-            for i in range(len(match_list)):
-                otc = int(match_list[i]["table_cell_id"])
-                if otc not in one_table_cells:
-                    one_table_cells.append(otc)
-            # Get bbox of pdf_cell:
-            pdf_cell_bbox = []
-            for pdf_cell in pdf_cells:
-                if pdf_cell["id"] == int(pdf_cell_id):
-                    pdf_cell_bbox = pdf_cell["bbox"]
-            # Get bbox of pdf_cell:
-            for table_cell in table_cells:
-                if table_cell["cell_id"] in one_table_cells:
-                    # Align bbox vertically to cover PDF cell
-                    new_bbox = [
-                        pdf_cell_bbox[0],
-                        pdf_cell_bbox[1],
-                        pdf_cell_bbox[2],
-                        pdf_cell_bbox[3],
-                    ]
-                    # We are sure cell is not empty,
-                    # because we assign PDF cell to it
-                    new_table_cell_class = "2"
-                    if "cell_class" in table_cell:
-                        new_table_cell_class = table_cell["cell_class"]
-                    new_table_cell = {
-                        "bbox": new_bbox,
-                        "cell_id": table_cell["cell_id"],
-                        "column_id": table_cell["column_id"],
-                        "label": table_cell["label"],
-                        "row_id": table_cell["row_id"],
-                        "cell_class": new_table_cell_class,
-                    }
+        Align table cell bboxes with good matches to encapsulate matching pdf cells
+        """
+        pdf_cell_dict = {pdf_cell["id"]: pdf_cell["bbox"] for pdf_cell in pdf_cells}
+        table_cell_dict = {cell["cell_id"]: cell for cell in table_cells}
-                    if "colspan_val" in table_cell:
-                        new_table_cell["colspan_val"] = table_cell["colspan_val"]
-                    if "rowspan_val" in table_cell:
-                        new_table_cell["rowspan_val"] = table_cell["rowspan_val"]
-                    new_table_cells.append(new_table_cell)
+        # Track unique cells we're going to add
+        processed_cells = set()
-        # Rebuild table_cells list deduplicating repeating cells,
-        # encapsulating all duplicate cells dimensions
+        # First pass - create initial new_table_cells with aligned bboxes
+        new_table_cells = []
-        for new_table_cell in new_table_cells:
-            cell_id_to_find = new_table_cell["cell_id"]
+        for pdf_cell_id, match_list in matches.items():
+            # Extract unique table cell ids from match_list
+            table_cell_ids = set(int(match["table_cell_id"]) for match in match_list)
-            x1s = []
-            y1s = []
-            x2s = []
-            y2s = []
+            # Get bbox of pdf_cell
+            pdf_cell_bbox = pdf_cell_dict.get(int(pdf_cell_id))
+            if not pdf_cell_bbox:
+                continue
-            found = 0
+            # Process each unique table cell
+            for cell_id in table_cell_ids:
+                if cell_id in processed_cells:
+                    continue
+                table_cell = table_cell_dict.get(cell_id)
+                if not table_cell:
+                    continue
+                # Create new table cell with aligned bbox
+                new_table_cell = table_cell.copy()
+                new_table_cell["bbox"] = list(pdf_cell_bbox)
+                # Set cell class
+                if "cell_class" not in new_table_cell:
+                    new_table_cell["cell_class"] = "2"
+                new_table_cells.append(new_table_cell)
+                processed_cells.add(cell_id)
+        # Second pass - aggregate bboxes for duplicate cells
+        cell_to_bboxes = {}
+        for cell in new_table_cells:
+            cell_id = cell["cell_id"]
+            if cell_id not in cell_to_bboxes:
+                cell_to_bboxes[cell_id] = []
+            cell_to_bboxes[cell_id].append(cell["bbox"])
+        # Create final clean table cells
+        clean_table_cells = []
+        processed_ids = set()
+        for cell in new_table_cells:
+            cell_id = cell["cell_id"]
+            if cell_id in processed_ids:
+                continue
-            for found_cell in new_table_cells:
-                if found_cell["cell_id"] == cell_id_to_find:
-                    found += 1
-                    x1s.append(found_cell["bbox"][0])
-                    y1s.append(found_cell["bbox"][1])
-                    x2s.append(found_cell["bbox"][2])
-                    y2s.append(found_cell["bbox"][3])
+            bboxes = cell_to_bboxes[cell_id]
+            if len(bboxes) > 1:
+                # Merge bboxes
+                x1s = [bbox[0] for bbox in bboxes]
+                y1s = [bbox[1] for bbox in bboxes]
+                x2s = [bbox[2] for bbox in bboxes]
+                y2s = [bbox[3] for bbox in bboxes]
-            min_x1 = min(x1s)
-            min_y1 = min(y1s)
-            max_x2 = max(x2s)
-            max_y2 = max(y2s)
+                cell["bbox"] = [min(x1s), min(y1s), max(x2s), max(y2s)]
-            if found > 1:
-                new_table_cell["bbox"] = [min_x1, min_y1, max_x2, max_y2]
+            clean_table_cells.append(cell)
+            processed_ids.add(cell_id)
-        clean_table_cells = [
-            i
-            for n, i in enumerate(new_table_cells)
-            if i not in new_table_cells[n + 1 :]
-        ]
         return clean_table_cells
     def _deduplicate_cells(self, tab_columns, table_cells, iou_matches, ioc_matches):

{docling_ibm_models-3.4.0 → docling_ibm_models-3.4.2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling-ibm-models"
-version = "3.4.0"  # DO NOT EDIT, updated automatically
+version = "3.4.2"  # DO NOT EDIT, updated automatically
 description = "This package contains the AI models used by the Docling PDF conversion package"
 authors = ["Nikos Livathinos <nli@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"