docling 2.13.0__py3-none-any.whl → 2.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docling/backend/html_backend.py CHANGED
@@ -37,10 +37,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
 
         try:
             if isinstance(self.path_or_stream, BytesIO):
-                text_stream = self.path_or_stream.getvalue().decode("utf-8")
+                text_stream = self.path_or_stream.getvalue()
                 self.soup = BeautifulSoup(text_stream, "html.parser")
             if isinstance(self.path_or_stream, Path):
-                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                with open(self.path_or_stream, "rb") as f:
                     html_content = f.read()
                 self.soup = BeautifulSoup(html_content, "html.parser")
         except Exception as e:
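The HTML backend now hands raw bytes to BeautifulSoup instead of force-decoding UTF-8, letting the parser detect the document's real encoding. A standalone sketch (not docling code) of why bytes input is more robust:

```python
from bs4 import BeautifulSoup

# A page declared and encoded as ISO-8859-1: byte 0xE9 is invalid UTF-8,
# so raw.decode("utf-8") would raise UnicodeDecodeError before parsing.
raw = (
    '<html><head><meta charset="iso-8859-1"></head>'
    "<body><p>caf\xe9</p></body></html>"
).encode("iso-8859-1")

# With bytes input, BeautifulSoup's encoding detection honors the declared
# charset and decodes the markup itself.
soup = BeautifulSoup(raw, "html.parser")
print(soup.p.text)  # café
```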
docling/backend/mspowerpoint_backend.py CHANGED
@@ -16,7 +16,7 @@ from docling_core.types.doc import (
     TableCell,
     TableData,
 )
-from PIL import Image
+from PIL import Image, UnidentifiedImageError
 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
 
@@ -120,6 +120,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         bullet_type = "None"
         list_text = ""
         list_label = GroupLabel.LIST
+        doc_label = DocItemLabel.LIST_ITEM
         prov = self.generate_prov(shape, slide_ind, shape.text.strip())
 
         # Identify if shape contains lists
@@ -276,16 +277,19 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         im_dpi, _ = image.dpi
 
         # Open it with PIL
-        pil_image = Image.open(BytesIO(image_bytes))
-
-        # shape has picture
-        prov = self.generate_prov(shape, slide_ind, "")
-        doc.add_picture(
-            parent=parent_slide,
-            image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
-            caption=None,
-            prov=prov,
-        )
+        try:
+            pil_image = Image.open(BytesIO(image_bytes))
+
+            # shape has picture
+            prov = self.generate_prov(shape, slide_ind, "")
+            doc.add_picture(
+                parent=parent_slide,
+                image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
+                caption=None,
+                prov=prov,
+            )
+        except (UnidentifiedImageError, OSError) as e:
+            _log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
         return
 
     def handle_tables(self, shape, parent_slide, slide_ind, doc):
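With this try/except, a PPTX containing an image payload Pillow cannot identify (WMF/EMF blobs are a common case) no longer aborts the whole conversion. A standalone sketch of the same defensive pattern; the helper name is ours, not docling's:

```python
import logging
from io import BytesIO

from PIL import Image, UnidentifiedImageError

_log = logging.getLogger(__name__)


def load_image_or_none(image_bytes: bytes):
    """Return a PIL image, or None when Pillow cannot decode the payload."""
    try:
        return Image.open(BytesIO(image_bytes))
    except (UnidentifiedImageError, OSError) as exc:
        # Log and continue instead of failing the document.
        _log.warning("image cannot be loaded by Pillow: %s", exc)
        return None
```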
docling/backend/xml/pubmed_backend.py ADDED
@@ -0,0 +1,592 @@
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import Any, Set, Union
+
+import lxml
+from bs4 import BeautifulSoup
+from docling_core.types.doc import (
+    DocItemLabel,
+    DoclingDocument,
+    DocumentOrigin,
+    GroupLabel,
+    TableCell,
+    TableData,
+)
+from lxml import etree
+from typing_extensions import TypedDict, override
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class Paragraph(TypedDict):
+    text: str
+    headers: list[str]
+
+
+class Author(TypedDict):
+    name: str
+    affiliation_names: list[str]
+
+
+class Table(TypedDict):
+    label: str
+    caption: str
+    content: str
+
+
+class FigureCaption(TypedDict):
+    label: str
+    caption: str
+
+
+class Reference(TypedDict):
+    author_names: str
+    title: str
+    journal: str
+    year: str
+
+
+class XMLComponents(TypedDict):
+    title: str
+    authors: list[Author]
+    abstract: str
+    paragraphs: list[Paragraph]
+    tables: list[Table]
+    figure_captions: list[FigureCaption]
+    references: list[Reference]
+
+
+class PubMedDocumentBackend(DeclarativeDocumentBackend):
+    """
+    The code from this document backend has been developed by modifying parts of the PubMed Parser library (version 0.5.0, released on 12.08.2024):
+    Achakulvisut et al., (2020).
+    Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML Dataset.
+    Journal of Open Source Software, 5(46), 1979,
+    https://doi.org/10.21105/joss.01979
+    """
+
+    @override
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)
+        self.path_or_stream = path_or_stream
+
+        # Initialize parents for the document hierarchy
+        self.parents: dict = {}
+
+        self.valid = False
+        try:
+            if isinstance(self.path_or_stream, BytesIO):
+                self.path_or_stream.seek(0)
+            self.tree: lxml.etree._ElementTree = etree.parse(self.path_or_stream)
+            if "/NLM//DTD JATS" in self.tree.docinfo.public_id:
+                self.valid = True
+        except Exception as exc:
+            raise RuntimeError(
+                f"Could not initialize PubMed backend for file with hash {self.document_hash}."
+            ) from exc
+
+    @override
+    def is_valid(self) -> bool:
+        return self.valid
+
+    @classmethod
+    @override
+    def supports_pagination(cls) -> bool:
+        return False
+
+    @override
+    def unload(self):
+        if isinstance(self.path_or_stream, BytesIO):
+            self.path_or_stream.close()
+        self.path_or_stream = None
+
+    @classmethod
+    @override
+    def supported_formats(cls) -> Set[InputFormat]:
+        return {InputFormat.XML_PUBMED}
+
+    @override
+    def convert(self) -> DoclingDocument:
+        # Create empty document
+        origin = DocumentOrigin(
+            filename=self.file.name or "file",
+            mimetype="application/xml",
+            binary_hash=self.document_hash,
+        )
+        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
+
+        _log.debug("Trying to convert PubMed XML document...")
+
+        # Get parsed XML components
+        xml_components: XMLComponents = self._parse()
+
+        # Add XML components to the document
+        doc = self._populate_document(doc, xml_components)
+        return doc
+
+    def _parse_title(self) -> str:
+        title: str = " ".join(
+            [
+                t.replace("\n", "")
+                for t in self.tree.xpath(".//title-group/article-title")[0].itertext()
+            ]
+        )
+        return title
+
+    def _parse_authors(self) -> list[Author]:
+        # Get mapping between affiliation ids and names
+        affiliation_names = []
+        for affiliation_node in self.tree.xpath(".//aff[@id]"):
+            affiliation_names.append(
+                ": ".join([t for t in affiliation_node.itertext() if t != "\n"])
+            )
+        affiliation_ids_names = {
+            id: name
+            for id, name in zip(self.tree.xpath(".//aff[@id]/@id"), affiliation_names)
+        }
+
+        # Get author names and affiliation names
+        authors: list[Author] = []
+        for author_node in self.tree.xpath(
+            './/contrib-group/contrib[@contrib-type="author"]'
+        ):
+            author: Author = {
+                "name": "",
+                "affiliation_names": [],
+            }
+
+            # Affiliation names
+            affiliation_ids = [
+                a.attrib["rid"] for a in author_node.xpath('xref[@ref-type="aff"]')
+            ]
+            for id in affiliation_ids:
+                if id in affiliation_ids_names:
+                    author["affiliation_names"].append(affiliation_ids_names[id])
+
+            # Name
+            author["name"] = (
+                author_node.xpath("name/surname")[0].text
+                + " "
+                + author_node.xpath("name/given-names")[0].text
+            )
+
+            authors.append(author)
+        return authors
+
+    def _parse_abstract(self) -> str:
+        texts = []
+        for abstract_node in self.tree.xpath(".//abstract"):
+            for text in abstract_node.itertext():
+                texts.append(text.replace("\n", ""))
+        abstract: str = "".join(texts)
+        return abstract
+
+    def _parse_main_text(self) -> list[Paragraph]:
+        paragraphs: list[Paragraph] = []
+        for paragraph_node in self.tree.xpath("//body//p"):
+            # Skip captions
+            if "/caption" in paragraph_node.getroottree().getpath(paragraph_node):
+                continue
+
+            paragraph: Paragraph = {"text": "", "headers": []}
+
+            # Text
+            paragraph["text"] = "".join(
+                [t.replace("\n", "") for t in paragraph_node.itertext()]
+            )
+
+            # Header
+            path = "../title"
+            while len(paragraph_node.xpath(path)) > 0:
+                paragraph["headers"].append(
+                    "".join(
+                        [
+                            t.replace("\n", "")
+                            for t in paragraph_node.xpath(path)[0].itertext()
+                        ]
+                    )
+                )
+                path = "../" + path
+
+            paragraphs.append(paragraph)
+
+        return paragraphs
+
+    def _parse_tables(self) -> list[Table]:
+        tables: list[Table] = []
+        for table_node in self.tree.xpath(".//body//table-wrap"):
+            table: Table = {"label": "", "caption": "", "content": ""}
+
+            # Content
+            if len(table_node.xpath("table")) > 0:
+                table_content_node = table_node.xpath("table")[0]
+            elif len(table_node.xpath("alternatives/table")) > 0:
+                table_content_node = table_node.xpath("alternatives/table")[0]
+            else:
+                table_content_node = None
+            if table_content_node != None:
+                table["content"] = etree.tostring(table_content_node).decode("utf-8")
+
+            # Caption
+            if len(table_node.xpath("caption/p")) > 0:
+                caption_node = table_node.xpath("caption/p")[0]
+            elif len(table_node.xpath("caption/title")) > 0:
+                caption_node = table_node.xpath("caption/title")[0]
+            else:
+                caption_node = None
+            if caption_node != None:
+                table["caption"] = "".join(
+                    [t.replace("\n", "") for t in caption_node.itertext()]
+                )
+
+            # Label
+            if len(table_node.xpath("label")) > 0:
+                table["label"] = table_node.xpath("label")[0].text
+
+            tables.append(table)
+        return tables
+
+    def _parse_figure_captions(self) -> list[FigureCaption]:
+        figure_captions: list[FigureCaption] = []
+
+        if not (self.tree.xpath(".//fig")):
+            return figure_captions
+
+        for figure_node in self.tree.xpath(".//fig"):
+            figure_caption: FigureCaption = {
+                "caption": "",
+                "label": "",
+            }
+
+            # Label
+            if figure_node.xpath("label"):
+                figure_caption["label"] = "".join(
+                    [
+                        t.replace("\n", "")
+                        for t in figure_node.xpath("label")[0].itertext()
+                    ]
+                )
+
+            # Caption
+            if figure_node.xpath("caption"):
+                caption = ""
+                for caption_node in figure_node.xpath("caption")[0].getchildren():
+                    caption += (
+                        "".join([t.replace("\n", "") for t in caption_node.itertext()])
+                        + "\n"
+                    )
+                figure_caption["caption"] = caption
+
+            figure_captions.append(figure_caption)
+
+        return figure_captions
+
+    def _parse_references(self) -> list[Reference]:
+        references: list[Reference] = []
+        for reference_node_abs in self.tree.xpath(".//ref-list/ref"):
+            reference: Reference = {
+                "author_names": "",
+                "title": "",
+                "journal": "",
+                "year": "",
+            }
+            reference_node: Any = None
+            for tag in ["mixed-citation", "element-citation", "citation"]:
+                if len(reference_node_abs.xpath(tag)) > 0:
+                    reference_node = reference_node_abs.xpath(tag)[0]
+                    break
+
+            if reference_node is None:
+                continue
+
+            if all(
+                not (ref_type in ["citation-type", "publication-type"])
+                for ref_type in reference_node.attrib.keys()
+            ):
+                continue
+
+            # Author names
+            names = []
+            if len(reference_node.xpath("name")) > 0:
+                for name_node in reference_node.xpath("name"):
+                    name_str = " ".join(
+                        [t.text for t in name_node.getchildren() if (t.text != None)]
+                    )
+                    names.append(name_str)
+            elif len(reference_node.xpath("person-group")) > 0:
+                for name_node in reference_node.xpath("person-group")[0]:
+                    name_str = (
+                        name_node.xpath("given-names")[0].text
+                        + " "
+                        + name_node.xpath("surname")[0].text
+                    )
+                    names.append(name_str)
+            reference["author_names"] = "; ".join(names)
+
+            # Title
+            if len(reference_node.xpath("article-title")) > 0:
+                reference["title"] = " ".join(
+                    [
+                        t.replace("\n", " ")
+                        for t in reference_node.xpath("article-title")[0].itertext()
+                    ]
+                )
+
+            # Journal
+            if len(reference_node.xpath("source")) > 0:
+                reference["journal"] = reference_node.xpath("source")[0].text
+
+            # Year
+            if len(reference_node.xpath("year")) > 0:
+                reference["year"] = reference_node.xpath("year")[0].text
+
+            if (
+                not (reference_node.xpath("article-title"))
+                and not (reference_node.xpath("journal"))
+                and not (reference_node.xpath("year"))
+            ):
+                reference["title"] = reference_node.text
+
+            references.append(reference)
+        return references
+
+    def _parse(self) -> XMLComponents:
+        """Parsing PubMed document."""
+        xml_components: XMLComponents = {
+            "title": self._parse_title(),
+            "authors": self._parse_authors(),
+            "abstract": self._parse_abstract(),
+            "paragraphs": self._parse_main_text(),
+            "tables": self._parse_tables(),
+            "figure_captions": self._parse_figure_captions(),
+            "references": self._parse_references(),
+        }
+        return xml_components
+
+    def _populate_document(
+        self, doc: DoclingDocument, xml_components: XMLComponents
+    ) -> DoclingDocument:
+        self._add_title(doc, xml_components)
+        self._add_authors(doc, xml_components)
+        self._add_abstract(doc, xml_components)
+        self._add_main_text(doc, xml_components)
+
+        if xml_components["tables"]:
+            self._add_tables(doc, xml_components)
+
+        if xml_components["figure_captions"]:
+            self._add_figure_captions(doc, xml_components)
+
+        self._add_references(doc, xml_components)
+        return doc
+
+    def _add_figure_captions(
+        self, doc: DoclingDocument, xml_components: XMLComponents
+    ) -> None:
+        self.parents["Figures"] = doc.add_heading(
+            parent=self.parents["Title"], text="Figures"
+        )
+        for figure_caption_xml_component in xml_components["figure_captions"]:
+            figure_caption_text = (
+                figure_caption_xml_component["label"]
+                + ": "
+                + figure_caption_xml_component["caption"].strip()
+            )
+            fig_caption = doc.add_text(
+                label=DocItemLabel.CAPTION, text=figure_caption_text
+            )
+            doc.add_picture(
+                parent=self.parents["Figures"],
+                caption=fig_caption,
+            )
+        return
+
+    def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
+        self.parents["Title"] = doc.add_text(
+            parent=None,
+            text=xml_components["title"],
+            label=DocItemLabel.TITLE,
+        )
+        return
+
+    def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
+        authors_affiliations: list = []
+        for author in xml_components["authors"]:
+            authors_affiliations.append(author["name"])
+            authors_affiliations.append(", ".join(author["affiliation_names"]))
+        authors_affiliations_str = "; ".join(authors_affiliations)
+
+        doc.add_text(
+            parent=self.parents["Title"],
+            text=authors_affiliations_str,
+            label=DocItemLabel.PARAGRAPH,
+        )
+        return
+
+    def _add_abstract(
+        self, doc: DoclingDocument, xml_components: XMLComponents
+    ) -> None:
+        abstract_text: str = xml_components["abstract"]
+        self.parents["Abstract"] = doc.add_heading(
+            parent=self.parents["Title"], text="Abstract"
+        )
+        doc.add_text(
+            parent=self.parents["Abstract"],
+            text=abstract_text,
+            label=DocItemLabel.TEXT,
+        )
+        return
+
+    def _add_main_text(
+        self, doc: DoclingDocument, xml_components: XMLComponents
+    ) -> None:
+        added_headers: list = []
+        for paragraph in xml_components["paragraphs"]:
+            if not (paragraph["headers"]):
+                continue
+
+            # Header
+            for i, header in enumerate(reversed(paragraph["headers"])):
+                if header in added_headers:
+                    continue
+                added_headers.append(header)
+
+                if ((i - 1) >= 0) and list(reversed(paragraph["headers"]))[
+                    i - 1
+                ] in self.parents:
+                    parent = self.parents[list(reversed(paragraph["headers"]))[i - 1]]
+                else:
+                    parent = self.parents["Title"]
+
+                self.parents[header] = doc.add_heading(parent=parent, text=header)
+
+            # Paragraph text
+            if paragraph["headers"][0] in self.parents:
+                parent = self.parents[paragraph["headers"][0]]
+            else:
+                parent = self.parents["Title"]
+
+            doc.add_text(parent=parent, label=DocItemLabel.TEXT, text=paragraph["text"])
+        return
+
+    def _add_references(
+        self, doc: DoclingDocument, xml_components: XMLComponents
+    ) -> None:
+        self.parents["References"] = doc.add_heading(
+            parent=self.parents["Title"], text="References"
+        )
+        current_list = doc.add_group(
+            parent=self.parents["References"], label=GroupLabel.LIST, name="list"
+        )
+        for reference in xml_components["references"]:
+            reference_text: str = ""
+            if reference["author_names"]:
+                reference_text += reference["author_names"] + ". "
+
+            if reference["title"]:
+                reference_text += reference["title"]
+                if reference["title"][-1] != ".":
+                    reference_text += "."
+                reference_text += " "
+
+            if reference["journal"]:
+                reference_text += reference["journal"]
+
+            if reference["year"]:
+                reference_text += " (" + reference["year"] + ")"
+
+            if not (reference_text):
+                _log.debug(f"Skipping reference for: {str(self.file)}")
+                continue
+
+            doc.add_list_item(
+                text=reference_text, enumerated=False, parent=current_list
+            )
+        return
+
+    def _add_tables(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
+        self.parents["Tables"] = doc.add_heading(
+            parent=self.parents["Title"], text="Tables"
+        )
+        for table_xml_component in xml_components["tables"]:
+            try:
+                self._add_table(doc, table_xml_component)
+            except Exception as e:
+                _log.debug(f"Skipping unsupported table for: {str(self.file)}")
+                pass
+        return
+
+    def _add_table(self, doc: DoclingDocument, table_xml_component: Table) -> None:
+        soup = BeautifulSoup(table_xml_component["content"], "html.parser")
+        table_tag = soup.find("table")
+
+        nested_tables = table_tag.find("table")
+        if nested_tables:
+            _log.debug(f"Skipping nested table for: {str(self.file)}")
+            return
+
+        # Count the number of rows (number of <tr> elements)
+        num_rows = len(table_tag.find_all("tr"))
+
+        # Find the number of columns (taking into account colspan)
+        num_cols = 0
+        for row in table_tag.find_all("tr"):
+            col_count = 0
+            for cell in row.find_all(["td", "th"]):
+                colspan = int(cell.get("colspan", 1))
+                col_count += colspan
+            num_cols = max(num_cols, col_count)
+
+        grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
+
+        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
+
+        # Iterate over the rows in the table
+        for row_idx, row in enumerate(table_tag.find_all("tr")):
+            # For each row, find all the column cells (both <td> and <th>)
+            cells = row.find_all(["td", "th"])
+
+            # Check if each cell in the row is a header -> means it is a column header
+            col_header = True
+            for j, html_cell in enumerate(cells):
+                if html_cell.name == "td":
+                    col_header = False
+
+            # Extract and print the text content of each cell
+            col_idx = 0
+            for _, html_cell in enumerate(cells):
+                text = html_cell.text
+
+                col_span = int(html_cell.get("colspan", 1))
+                row_span = int(html_cell.get("rowspan", 1))
+
+                while grid[row_idx][col_idx] != None:
+                    col_idx += 1
+                for r in range(row_span):
+                    for c in range(col_span):
+                        grid[row_idx + r][col_idx + c] = text
+
+                cell = TableCell(
+                    text=text,
+                    row_span=row_span,
+                    col_span=col_span,
+                    start_row_offset_idx=row_idx,
+                    end_row_offset_idx=row_idx + row_span,
+                    start_col_offset_idx=col_idx,
+                    end_col_offset_idx=col_idx + col_span,
+                    col_header=col_header,
+                    row_header=((not col_header) and html_cell.name == "th"),
+                )
+                data.table_cells.append(cell)
+
+        table_caption = doc.add_text(
+            label=DocItemLabel.CAPTION,
+            text=table_xml_component["label"] + ": " + table_xml_component["caption"],
+        )
+        doc.add_table(data=data, parent=self.parents["Tables"], caption=table_caption)
+        return
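Once this backend is registered with the converter (see the format-option hunks in docling/document_converter.py further down), a PubMed/JATS XML file converts like any other input. A minimal usage sketch; `article.nxml` is a hypothetical local JATS file, recognized via the "/NLM//DTD JATS" public identifier in its doctype:

```python
from docling.document_converter import DocumentConverter

converter = DocumentConverter()
result = converter.convert("article.nxml")  # hypothetical JATS XML file
print(result.document.export_to_markdown())
```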
docling/cli/main.py CHANGED
@@ -164,6 +164,11 @@ def convert(
     to_formats: List[OutputFormat] = typer.Option(
         None, "--to", help="Specify output formats. Defaults to Markdown."
     ),
+    headers: str = typer.Option(
+        None,
+        "--headers",
+        help="Specify http request headers used when fetching url input sources in the form of a JSON string",
+    ),
     image_export_mode: Annotated[
         ImageRefMode,
         typer.Option(
@@ -279,12 +284,19 @@ def convert(
     if from_formats is None:
         from_formats = [e for e in InputFormat]
 
+    parsed_headers: Optional[Dict[str, str]] = None
+    if headers is not None:
+        headers_t = TypeAdapter(Dict[str, str])
+        parsed_headers = headers_t.validate_json(headers)
+
     with tempfile.TemporaryDirectory() as tempdir:
         input_doc_paths: List[Path] = []
         for src in input_sources:
             try:
                 # check if we can fetch some remote url
-                source = resolve_source_to_path(source=src, workdir=Path(tempdir))
+                source = resolve_source_to_path(
+                    source=src, headers=parsed_headers, workdir=Path(tempdir)
+                )
                 input_doc_paths.append(source)
             except FileNotFoundError:
                 err_console.print(
@@ -390,7 +402,7 @@ def convert(
         start_time = time.time()
 
         conv_results = doc_converter.convert_all(
-            input_doc_paths, raises_on_error=abort_on_error
+            input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
        )
 
         output.mkdir(parents=True, exist_ok=True)
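Together these hunks let headers flow from the command line to URL fetching, e.g. `docling --headers '{"Authorization": "Bearer <token>"}' https://example.com/report.pdf` (hypothetical invocation). The JSON string is validated into a `Dict[str, str]` with pydantic's `TypeAdapter`; a standalone sketch of that validation step:

```python
from typing import Dict, Optional

from pydantic import TypeAdapter

# Same validation the CLI performs: a JSON object string becomes Dict[str, str],
# raising a ValidationError on malformed input.
raw = '{"Authorization": "Bearer <token>", "User-Agent": "docling"}'
parsed_headers: Optional[Dict[str, str]] = TypeAdapter(Dict[str, str]).validate_json(raw)
print(parsed_headers["User-Agent"])  # docling
```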
docling/datamodel/base_models.py CHANGED
@@ -33,6 +33,7 @@ class InputFormat(str, Enum):
     DOCX = "docx"
     PPTX = "pptx"
     HTML = "html"
+    XML_PUBMED = "xml_pubmed"
     IMAGE = "image"
     PDF = "pdf"
     ASCIIDOC = "asciidoc"
@@ -55,6 +56,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.PDF: ["pdf"],
     InputFormat.MD: ["md"],
     InputFormat.HTML: ["html", "htm", "xhtml"],
+    InputFormat.XML_PUBMED: ["xml", "nxml"],
     InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
     InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
     InputFormat.XLSX: ["xlsx"],
@@ -72,6 +74,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
         "application/vnd.openxmlformats-officedocument.presentationml.presentation",
     ],
     InputFormat.HTML: ["text/html", "application/xhtml+xml"],
+    InputFormat.XML_PUBMED: ["application/xml"],
     InputFormat.IMAGE: [
         "image/png",
         "image/jpeg",
docling/datamodel/document.py CHANGED
@@ -227,13 +227,18 @@ class _DummyBackend(AbstractDocumentBackend):
 
 class _DocumentConversionInput(BaseModel):
 
     path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
+    headers: Optional[Dict[str, str]] = None
     limits: Optional[DocumentLimits] = DocumentLimits()
 
     def docs(
         self, format_options: Dict[InputFormat, "FormatOption"]
     ) -> Iterable[InputDocument]:
         for item in self.path_or_stream_iterator:
-            obj = resolve_source_to_stream(item) if isinstance(item, str) else item
+            obj = (
+                resolve_source_to_stream(item, self.headers)
+                if isinstance(item, str)
+                else item
+            )
             format = self._guess_format(obj)
             backend: Type[AbstractDocumentBackend]
             if format not in format_options.keys():
@@ -292,8 +297,7 @@ class _DocumentConversionInput(BaseModel):
         mime = mime or "text/plain"
         formats = MimeTypeToFormat.get(mime, [])
         if formats:
-            # TODO: remove application/xml case after adding another XML parse
-            if len(formats) == 1 and mime not in ("text/plain", "application/xml"):
+            if len(formats) == 1 and mime not in ("text/plain"):
                 return formats[0]
             else:  # ambiguity in formats
                 return _DocumentConversionInput._guess_from_content(
@@ -325,6 +329,12 @@ class _DocumentConversionInput(BaseModel):
             ):
                 input_format = InputFormat.XML_USPTO
 
+            if (
+                InputFormat.XML_PUBMED in formats
+                and "/NLM//DTD JATS" in xml_doctype
+            ):
+                input_format = InputFormat.XML_PUBMED
+
         elif mime == "text/plain":
             if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
                 input_format = InputFormat.XML_USPTO
@@ -340,7 +350,6 @@ class _DocumentConversionInput(BaseModel):
             mime = FormatToMimeType[InputFormat.HTML][0]
         elif ext in FormatToExtensions[InputFormat.MD]:
             mime = FormatToMimeType[InputFormat.MD][0]
-
         return mime
 
     @staticmethod
@@ -370,4 +379,10 @@ class _DocumentConversionInput(BaseModel):
         if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
             return "text/html"
 
+        p = re.compile(
+            r"<!doctype\s+(?P<root>[a-zA-Z_:][a-zA-Z0-9_:.-]*)\s+.*>\s*<(?P=root)\b"
+        )
+        if p.search(content_str):
+            return "application/xml"
+
         return None
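The new fallback recognizes generic XML by checking that the root element named in the doctype reappears as the first opened tag. A standalone sketch of the heuristic (it assumes `content_str` was already lowercased upstream, as the `<!doctype\s+html` branch above implies):

```python
import re

p = re.compile(
    r"<!doctype\s+(?P<root>[a-zA-Z_:][a-zA-Z0-9_:.-]*)\s+.*>\s*<(?P=root)\b"
)

# A lowercased JATS prolog: "article" is captured from the doctype and must
# reappear as the first tag for the content to be classified as XML.
sample = (
    '<!doctype article public "-//nlm//dtd jats (z39.96) '
    'journal archiving and interchange dtd v1.2 20190208//en" '
    '"jats-archivearticle1.dtd"> <article dtd-version="1.2">'
)
print(bool(p.search(sample)))  # True
```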
docling/document_converter.py CHANGED
@@ -15,6 +15,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
+from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
 from docling.datamodel.base_models import (
     ConversionStatus,
@@ -88,6 +89,11 @@ class PatentUsptoFormatOption(FormatOption):
     backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
 
 
+class XMLPubMedFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
+
+
 class ImageFormatOption(FormatOption):
     pipeline_cls: Type = StandardPdfPipeline
     backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
@@ -121,6 +127,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
         InputFormat.XML_USPTO: FormatOption(
             pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
         ),
+        InputFormat.XML_PUBMED: FormatOption(
+            pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
+        ),
         InputFormat.IMAGE: FormatOption(
             pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
         ),
@@ -167,16 +176,17 @@ class DocumentConverter:
     def convert(
         self,
         source: Union[Path, str, DocumentStream],  # TODO review naming
+        headers: Optional[Dict[str, str]] = None,
         raises_on_error: bool = True,
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,
     ) -> ConversionResult:
-
         all_res = self.convert_all(
             source=[source],
             raises_on_error=raises_on_error,
             max_num_pages=max_num_pages,
             max_file_size=max_file_size,
+            headers=headers,
         )
         return next(all_res)
 
@@ -184,6 +194,7 @@ class DocumentConverter:
     def convert_all(
         self,
         source: Iterable[Union[Path, str, DocumentStream]],  # TODO review naming
+        headers: Optional[Dict[str, str]] = None,
         raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,
@@ -193,8 +204,7 @@ class DocumentConverter:
             max_file_size=max_file_size,
         )
         conv_input = _DocumentConversionInput(
-            path_or_stream_iterator=source,
-            limits=limits,
+            path_or_stream_iterator=source, limits=limits, headers=headers
         )
         conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
docling/models/base_ocr_model.py CHANGED
@@ -138,18 +138,31 @@ class BaseOcrModel(BasePageModel):
 
     def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
         image = copy.deepcopy(page.image)
+        scale_x = image.width / page.size.width
+        scale_y = image.height / page.size.height
+
         draw = ImageDraw.Draw(image, "RGBA")
 
         # Draw OCR rectangles as yellow filled rect
         for rect in ocr_rects:
             x0, y0, x1, y1 = rect.as_tuple()
+            y0 *= scale_x
+            y1 *= scale_y
+            x0 *= scale_x
+            x1 *= scale_x
+
             shade_color = (255, 255, 0, 40)  # transparent yellow
             draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)
 
         # Draw OCR and programmatic cells
         for tc in page.cells:
             x0, y0, x1, y1 = tc.bbox.as_tuple()
-            color = "red"
+            y0 *= scale_x
+            y1 *= scale_y
+            x0 *= scale_x
+            x1 *= scale_x
+
+            color = "gray"
             if isinstance(tc, OcrCell):
                 color = "magenta"
             draw.rectangle([(x0, y0), (x1, y1)], outline=color)
docling/models/layout_model.py CHANGED
@@ -67,29 +67,9 @@ class LayoutModel(BasePageModel):
         - Right: Clusters including FORM, KEY_VALUE_REGION, and PICTURE.
           Includes label names and confidence scores for each cluster.
         """
-        label_to_color = {
-            DocItemLabel.TEXT: (255, 255, 153),  # Light Yellow
-            DocItemLabel.CAPTION: (255, 204, 153),  # Light Orange
-            DocItemLabel.LIST_ITEM: (153, 153, 255),  # Light Purple
-            DocItemLabel.FORMULA: (192, 192, 192),  # Gray
-            DocItemLabel.TABLE: (255, 204, 204),  # Light Pink
-            DocItemLabel.PICTURE: (255, 204, 164),  # Light Beige
-            DocItemLabel.SECTION_HEADER: (255, 153, 153),  # Light Red
-            DocItemLabel.PAGE_HEADER: (204, 255, 204),  # Light Green
-            DocItemLabel.PAGE_FOOTER: (
-                204,
-                255,
-                204,
-            ),  # Light Green (same as Page-Header)
-            DocItemLabel.TITLE: (255, 153, 153),  # Light Red (same as Section-Header)
-            DocItemLabel.FOOTNOTE: (200, 200, 255),  # Light Blue
-            DocItemLabel.DOCUMENT_INDEX: (220, 220, 220),  # Light Gray
-            DocItemLabel.CODE: (125, 125, 125),  # Gray
-            DocItemLabel.CHECKBOX_SELECTED: (255, 182, 193),  # Pale Green
-            DocItemLabel.CHECKBOX_UNSELECTED: (255, 182, 193),  # Light Pink
-            DocItemLabel.FORM: (200, 255, 255),  # Light Cyan
-            DocItemLabel.KEY_VALUE_REGION: (183, 65, 14),  # Rusty orange
-        }
+        scale_x = page.image.width / page.size.width
+        scale_y = page.image.height / page.size.height
+
         # Filter clusters for left and right images
         exclude_labels = {
             DocItemLabel.FORM,
@@ -118,6 +98,11 @@ class LayoutModel(BasePageModel):
             cell_color = (0, 0, 0, 40)  # Transparent black for cells
             for tc in c.cells:
                 cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
+                cx0 *= scale_x
+                cx1 *= scale_x
+                cy0 *= scale_x
+                cy1 *= scale_y
+
                 draw.rectangle(
                     [(cx0, cy0), (cx1, cy1)],
                     outline=None,
@@ -125,8 +110,16 @@ class LayoutModel(BasePageModel):
                 )
             # Draw cluster rectangle
             x0, y0, x1, y1 = c.bbox.as_tuple()
-            cluster_fill_color = (*list(label_to_color.get(c.label)), 70)
-            cluster_outline_color = (*list(label_to_color.get(c.label)), 255)
+            x0 *= scale_x
+            x1 *= scale_x
+            y0 *= scale_x
+            y1 *= scale_y
+
+            cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
+            cluster_outline_color = (
+                *list(DocItemLabel.get_color(c.label)),
+                255,
+            )
             draw.rectangle(
                 [(x0, y0), (x1, y1)],
                 outline=cluster_outline_color,
docling/models/table_structure_model.py CHANGED
@@ -66,23 +66,43 @@ class TableStructureModel(BasePageModel):
         show: bool = False,
     ):
         assert page._backend is not None
+        assert page.size is not None
 
         image = (
             page._backend.get_page_image()
         )  # make new image to avoid drawing on the saved ones
+
+        scale_x = image.width / page.size.width
+        scale_y = image.height / page.size.height
+
         draw = ImageDraw.Draw(image)
 
         for table_element in tbl_list:
             x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple()
+            y0 *= scale_x
+            y1 *= scale_y
+            x0 *= scale_x
+            x1 *= scale_x
+
             draw.rectangle([(x0, y0), (x1, y1)], outline="red")
 
             for cell in table_element.cluster.cells:
                 x0, y0, x1, y1 = cell.bbox.as_tuple()
+                x0 *= scale_x
+                x1 *= scale_x
+                y0 *= scale_x
+                y1 *= scale_y
+
                 draw.rectangle([(x0, y0), (x1, y1)], outline="green")
 
             for tc in table_element.table_cells:
                 if tc.bbox is not None:
                     x0, y0, x1, y1 = tc.bbox.as_tuple()
+                    x0 *= scale_x
+                    x1 *= scale_x
+                    y0 *= scale_x
+                    y1 *= scale_y
+
                     if tc.column_header:
                         width = 3
                     else:
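The visualization hunks in base_ocr_model.py, layout_model.py, and table_structure_model.py all apply the same idea: bounding boxes are expressed in page coordinates, while the debug image may be rendered at a different resolution, so coordinates are multiplied by per-axis scale factors before drawing. A generic, standalone sketch of the pattern (page and image sizes are made up):

```python
from PIL import Image, ImageDraw

# Stand-in values: a 612x792 pt page rendered as a 1224x1584 px image.
page_w, page_h = 612.0, 792.0
image = Image.new("RGB", (1224, 1584), "white")

scale_x = image.width / page_w
scale_y = image.height / page_h

# A bbox in page coordinates, rescaled into image pixels before drawing.
x0, y0, x1, y1 = 72.0, 72.0, 540.0, 144.0
draw = ImageDraw.Draw(image)
draw.rectangle(
    [(x0 * scale_x, y0 * scale_y), (x1 * scale_x, y1 * scale_y)],
    outline="green",
)
```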
docling-2.15.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.13.0
+Version: 2.15.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -26,7 +26,7 @@ Provides-Extra: tesserocr
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
-Requires-Dist: docling-core[chunking] (>=2.12.1,<3.0.0)
+Requires-Dist: docling-core[chunking] (>=2.13.1,<3.0.0)
 Requires-Dist: docling-ibm-models (>=3.1.0,<4.0.0)
 Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
docling-2.15.0.dist-info/RECORD CHANGED
@@ -4,36 +4,37 @@ docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq
 docling/backend/asciidoc_backend.py,sha256=kXZxOLk_LvLFVZwnJVVwjmvc3QWZ0iiG7VnwjgtC3hI,14051
 docling/backend/docling_parse_backend.py,sha256=_jY5f5-KGI3hi5pcZAY6e7tPLocSi5JUWrxraDVszqI,7631
 docling/backend/docling_parse_v2_backend.py,sha256=1TDUdMIp3fEjCWBNjusUHiCUmH1g6yZQ-b13scofP0Y,8637
-docling/backend/html_backend.py,sha256=qbu1W8xoTGnXMuZPRPLq68hDbCEj6ygnpxP5gYaodAQ,15593
+docling/backend/html_backend.py,sha256=O8qXaw7MzOIdaxbBcjHieM9Ce4GEdtBj9YW0vpJspuA,15560
 docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
 docling/backend/msexcel_backend.py,sha256=23qUEScqr5GhY06xiqg-eBQ_JlAqO0FkPEmX6554sVA,12040
-docling/backend/mspowerpoint_backend.py,sha256=QD0NaatTO8U9CIFoiipkq3X5HxLZaaahH8nlrQ6ecDA,15710
+docling/backend/mspowerpoint_backend.py,sha256=kOGawhcn0BFq4M_C6kW0mY8vMIB24_6R6q6GaszbSt0,15957
 docling/backend/msword_backend.py,sha256=K1D_h0ulLA6KQsPe62327cDVkQqV1f7EetCHo66wCKw,19233
 docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
 docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
 docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/backend/xml/pubmed_backend.py,sha256=LMnpowjnxa5SydfNC00Ll840BYraL8dCJu-FfC9iSKk,20447
 docling/backend/xml/uspto_backend.py,sha256=2YsnB-WRARIAaHPL6gxHePP24GQGi-Up2_K8ZapD3k4,70974
 docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
 docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/cli/main.py,sha256=SdavhL0VTApK9JrKz0Pc1IYdnQhK-0OOaGT8zlTiN5c,15022
+docling/cli/main.py,sha256=NR7NEt8Sf3FE9D7sHpEmABM9mFMTMO5w0VPwYIIvVsk,15481
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/datamodel/base_models.py,sha256=vUQkOUawcZvJz_6E8RCxxd7wIN83B56h_FOQfdJAU1w,6105
-docling/datamodel/document.py,sha256=V-K0_BZIHG1VL9YpA-TUyP23p2ZYlFH0BfFrwBswA4U,12647
+docling/datamodel/base_models.py,sha256=50Jf5zk9c4-zmnOzZLoPBnHQhTX0_OFQzIkKgnKK1o4,6229
+docling/datamodel/document.py,sha256=OHM6bm0a-62xnAZ8DFlMHzATmbgNcfMxQoQO2udaW5Q,13071
 docling/datamodel/pipeline_options.py,sha256=u37Q12FVfu1UTEhgBiZ2KslyBtG3z3Eobqvaqd_MYaA,7735
 docling/datamodel/settings.py,sha256=Sw0rN_f8rdLV1eNvVeKiyET2Oe6oz9jtW3lJzniW9Do,1302
-docling/document_converter.py,sha256=ggJ0zv7qhm-_Vol2GkLHTTArb03p6g9kIS4PX66Wi5A,11950
+docling/document_converter.py,sha256=_pk0sHuPXJ14NEutatf5bK2VyNiU5cvYsVbh1HIgrIw,12431
 docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
 docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
-docling/models/base_ocr_model.py,sha256=rGSpBF4dByITcsBaRIgvFKpiu0CrhmZS_PHIo686Dw0,6428
+docling/models/base_ocr_model.py,sha256=qILpSHaqczAd1eUQzuoLxN-TYz3zozmN0K5_7kCWkrM,6738
 docling/models/ds_glm_model.py,sha256=CkhsP0cEWwm4wb1g3cLFriVGpVtELiUK3REDMkPwAMw,13028
 docling/models/easyocr_model.py,sha256=Kakb20ioBxDmNsIqoGvSSs_vbqAWN3QQNHYtEi-eErg,4990
-docling/models/layout_model.py,sha256=skfFdWh_NgijR4bIqyUH8zlda5mMOIIdN3yMttdmsN8,9871
+docling/models/layout_model.py,sha256=Xo8sclRTOO_V8Cr4RwuxB67vSWKF0LZ5nJRYU1WI--k,9063
 docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
 docling/models/page_assemble_model.py,sha256=qdEX0AIb76ZOqJV6O9j-7r67WmuIkUlwbb2PsL7eFK4,7608
 docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
 docling/models/rapid_ocr_model.py,sha256=LOIvczJs3_db2o8mtrKk-pIXgC-xqWqRLu2cjA3wvy4,4980
-docling/models/table_structure_model.py,sha256=3bUBeP26WwDNCb5_aAlRwVZe4xUYgnwsSHgWQYZxk9E,8892
+docling/models/table_structure_model.py,sha256=fUpCHthO4Uk3BhA99a85BHBm51fmdE9kfqhAk3WjuBw,9392
 docling/models/tesseract_ocr_cli_model.py,sha256=aKQBaty4cYu6zG_C5uy6Zm3eeRQo5fxIierbKixa2kc,6622
 docling/models/tesseract_ocr_model.py,sha256=RDf6iV1q-oXaGfZXv0bW6SqjHNKQvBUDlUsOkuz0neY,6095
 docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -48,8 +49,8 @@ docling/utils/glm_utils.py,sha256=IB19wToGath97gD3jAA3G_rQSptnZKhQCWLvPUCnkww,11
 docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
 docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
 docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
-docling-2.13.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
-docling-2.13.0.dist-info/METADATA,sha256=lvbM7MRSyjnE30dP9UPdJQPACQ8jEolnBvoqbr1kcVA,7732
-docling-2.13.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling-2.13.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
-docling-2.13.0.dist-info/RECORD,,
+docling-2.15.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.15.0.dist-info/METADATA,sha256=VglEfKqffhUESHax5WQgtOT_Fysyea5HLDFtf7yUpdM,7732
+docling-2.15.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling-2.15.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
+docling-2.15.0.dist-info/RECORD,,