PyPI - docling - Versions diffs - 1.12.2__tar.gz → 1.13.1__tar.gz - Mend

docling-1.12.2.tar.gz → docling-1.13.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

{docling-1.12.2 → docling-1.13.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 1.12.2
+Version: 1.13.1
 Summary: Docling PDF conversion package
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -21,8 +21,8 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Provides-Extra: examples
 Requires-Dist: certifi (>=2024.7.4)
-Requires-Dist: deepsearch-glm (>=0.21.0,<0.22.0)
-Requires-Dist: docling-core (>=1.3.0,<2.0.0)
+Requires-Dist: deepsearch-glm (>=0.21.1,<0.22.0)
+Requires-Dist: docling-core (>=1.5.0,<2.0.0)
 Requires-Dist: docling-ibm-models (>=1.2.0,<2.0.0)
 Requires-Dist: docling-parse (>=1.2.0,<2.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
@@ -122,7 +122,9 @@ from docling.document_converter import DocumentConverter
 source = "https://arxiv.org/pdf/2408.09869"  # PDF path or URL
 converter = DocumentConverter()
 result = converter.convert_single(source)
 print(result.render_as_markdown())  # output: "## Docling Technical Report[...]"
+print(result.render_as_doctags())  # output: "<document><title><page_1><loc_20>..."
 ```
 ### Convert a batch of documents

{docling-1.12.2 → docling-1.13.1}/README.md RENAMED Viewed

@@ -70,7 +70,9 @@ from docling.document_converter import DocumentConverter
 source = "https://arxiv.org/pdf/2408.09869"  # PDF path or URL
 converter = DocumentConverter()
 result = converter.convert_single(source)
 print(result.render_as_markdown())  # output: "## Docling Technical Report[...]"
+print(result.render_as_doctags())  # output: "<document><title><page_1><loc_20>..."
 ```
 ### Convert a batch of documents

{docling-1.12.2 → docling-1.13.1}/docling/datamodel/document.py RENAMED Viewed

@@ -368,20 +368,30 @@ class ConvertedDocument(BaseModel):
             "table",
             "figure",
         ],
-        page_tagging: bool = True,
-        location_tagging: bool = True,
-        location_dimensions: Tuple[int, int] = (100, 100),
-        add_new_line: bool = True,
+        xsize: int = 100,
+        ysize: int = 100,
+        add_location: bool = True,
+        add_content: bool = True,
+        add_page_index: bool = True,
+        # table specific flags
+        add_table_cell_location: bool = False,
+        add_table_cell_label: bool = True,
+        add_table_cell_text: bool = True,
     ) -> str:
         return self.output.export_to_document_tokens(
             delim=delim,
             main_text_start=main_text_start,
             main_text_stop=main_text_stop,
             main_text_labels=main_text_labels,
-            page_tagging=page_tagging,
-            location_tagging=location_tagging,
-            location_dimensions=location_dimensions,
-            add_new_line=add_new_line,
+            xsize=xsize,
+            ysize=ysize,
+            add_location=add_location,
+            add_content=add_content,
+            add_page_index=add_page_index,
+            # table specific flags
+            add_table_cell_location=add_table_cell_location,
+            add_table_cell_label=add_table_cell_label,
+            add_table_cell_text=add_table_cell_text,
         )
     def render_element_images(

{docling-1.12.2 → docling-1.13.1}/docling/utils/export.py RENAMED Viewed

@@ -9,67 +9,6 @@ from docling.datamodel.document import ConversionResult, Page
 _log = logging.getLogger(__name__)
-def _export_table_to_html(table: Table):
-    # TODO: this is flagged as internal, because we will move it
-    # to the docling-core package.
-    def _get_tablecell_span(cell: TableCell, ix):
-        if cell.spans is None:
-            span = set()
-        else:
-            span = set([s[ix] for s in cell.spans])
-        if len(span) == 0:
-            return 1, None, None
-        return len(span), min(span), max(span)
-    body = ""
-    nrows = table.num_rows
-    ncols = table.num_cols
-    if table.data is None:
-        return ""
-    for i in range(nrows):
-        body += "<tr>"
-        for j in range(ncols):
-            cell: TableCell = table.data[i][j]
-            rowspan, rowstart, rowend = _get_tablecell_span(cell, 0)
-            colspan, colstart, colend = _get_tablecell_span(cell, 1)
-            if rowstart is not None and rowstart != i:
-                continue
-            if colstart is not None and colstart != j:
-                continue
-            if rowstart is None:
-                rowstart = i
-            if colstart is None:
-                colstart = j
-            content = cell.text.strip()
-            label = cell.obj_type
-            label_class = "body"
-            celltag = "td"
-            if label in ["row_header", "row_multi_header", "row_title"]:
-                label_class = "header"
-            elif label in ["col_header", "col_multi_header"]:
-                label_class = "header"
-                celltag = "th"
-            opening_tag = f"{celltag}"
-            if rowspan > 1:
-                opening_tag += f' rowspan="{rowspan}"'
-            if colspan > 1:
-                opening_tag += f' colspan="{colspan}"'
-            body += f"<{opening_tag}>{content}</{celltag}>"
-        body += "</tr>"
-    body = f"<table>{body}</table>"
-    return body
 def generate_multimodal_pages(
     doc_result: ConversionResult,
 ) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
@@ -129,7 +68,7 @@ def generate_multimodal_pages(
             }
             if isinstance(item, Table):
-                table_html = _export_table_to_html(item)
+                table_html = item.export_to_html()
                 new_segment["data"].append(
                     {
                         "html_seq": table_html,
@@ -172,7 +111,7 @@ def generate_multimodal_pages(
         )
         # No page-tagging since we only do 1 page at the time
         content_dt = doc.export_to_document_tokens(
-            main_text_start=start_ix, main_text_stop=end_ix, page_tagging=False
+            main_text_start=start_ix, main_text_stop=end_ix, add_page_index=False
         )
         return content_text, content_md, content_dt, page_cells, page_segments, page

{docling-1.12.2 → docling-1.13.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "1.12.2"  # DO NOT EDIT, updated automatically
+version = "1.13.1"  # DO NOT EDIT, updated automatically
 description = "Docling PDF conversion package"
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"
@@ -23,9 +23,9 @@ packages = [{include = "docling"}]
 [tool.poetry.dependencies]
 python = "^3.10"
 pydantic = "^2.0.0"
-docling-core = "^1.3.0"
+docling-core = "^1.5.0"
 docling-ibm-models = "^1.2.0"
-deepsearch-glm = "^0.21.0"
+deepsearch-glm = "^0.21.1"
 filetype = "^1.2.0"
 pypdfium2 = "^4.30.0"
 pydantic-settings = "^2.3.0"