docling 2.13.0__tar.gz → 2.14.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. {docling-2.13.0 → docling-2.14.0}/PKG-INFO +1 -1
  2. docling-2.14.0/docling/backend/xml/pubmed_backend.py +592 -0
  3. {docling-2.13.0 → docling-2.14.0}/docling/datamodel/base_models.py +3 -0
  4. {docling-2.13.0 → docling-2.14.0}/docling/datamodel/document.py +13 -3
  5. {docling-2.13.0 → docling-2.14.0}/docling/document_converter.py +9 -1
  6. {docling-2.13.0 → docling-2.14.0}/pyproject.toml +1 -1
  7. {docling-2.13.0 → docling-2.14.0}/LICENSE +0 -0
  8. {docling-2.13.0 → docling-2.14.0}/README.md +0 -0
  9. {docling-2.13.0 → docling-2.14.0}/docling/__init__.py +0 -0
  10. {docling-2.13.0 → docling-2.14.0}/docling/backend/__init__.py +0 -0
  11. {docling-2.13.0 → docling-2.14.0}/docling/backend/abstract_backend.py +0 -0
  12. {docling-2.13.0 → docling-2.14.0}/docling/backend/asciidoc_backend.py +0 -0
  13. {docling-2.13.0 → docling-2.14.0}/docling/backend/docling_parse_backend.py +0 -0
  14. {docling-2.13.0 → docling-2.14.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  15. {docling-2.13.0 → docling-2.14.0}/docling/backend/html_backend.py +0 -0
  16. {docling-2.13.0 → docling-2.14.0}/docling/backend/md_backend.py +0 -0
  17. {docling-2.13.0 → docling-2.14.0}/docling/backend/msexcel_backend.py +0 -0
  18. {docling-2.13.0 → docling-2.14.0}/docling/backend/mspowerpoint_backend.py +0 -0
  19. {docling-2.13.0 → docling-2.14.0}/docling/backend/msword_backend.py +0 -0
  20. {docling-2.13.0 → docling-2.14.0}/docling/backend/pdf_backend.py +0 -0
  21. {docling-2.13.0 → docling-2.14.0}/docling/backend/pypdfium2_backend.py +0 -0
  22. {docling-2.13.0 → docling-2.14.0}/docling/backend/xml/__init__.py +0 -0
  23. {docling-2.13.0 → docling-2.14.0}/docling/backend/xml/uspto_backend.py +0 -0
  24. {docling-2.13.0 → docling-2.14.0}/docling/chunking/__init__.py +0 -0
  25. {docling-2.13.0 → docling-2.14.0}/docling/cli/__init__.py +0 -0
  26. {docling-2.13.0 → docling-2.14.0}/docling/cli/main.py +0 -0
  27. {docling-2.13.0 → docling-2.14.0}/docling/datamodel/__init__.py +0 -0
  28. {docling-2.13.0 → docling-2.14.0}/docling/datamodel/pipeline_options.py +0 -0
  29. {docling-2.13.0 → docling-2.14.0}/docling/datamodel/settings.py +0 -0
  30. {docling-2.13.0 → docling-2.14.0}/docling/exceptions.py +0 -0
  31. {docling-2.13.0 → docling-2.14.0}/docling/models/__init__.py +0 -0
  32. {docling-2.13.0 → docling-2.14.0}/docling/models/base_model.py +0 -0
  33. {docling-2.13.0 → docling-2.14.0}/docling/models/base_ocr_model.py +0 -0
  34. {docling-2.13.0 → docling-2.14.0}/docling/models/ds_glm_model.py +0 -0
  35. {docling-2.13.0 → docling-2.14.0}/docling/models/easyocr_model.py +0 -0
  36. {docling-2.13.0 → docling-2.14.0}/docling/models/layout_model.py +0 -0
  37. {docling-2.13.0 → docling-2.14.0}/docling/models/ocr_mac_model.py +0 -0
  38. {docling-2.13.0 → docling-2.14.0}/docling/models/page_assemble_model.py +0 -0
  39. {docling-2.13.0 → docling-2.14.0}/docling/models/page_preprocessing_model.py +0 -0
  40. {docling-2.13.0 → docling-2.14.0}/docling/models/rapid_ocr_model.py +0 -0
  41. {docling-2.13.0 → docling-2.14.0}/docling/models/table_structure_model.py +0 -0
  42. {docling-2.13.0 → docling-2.14.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  43. {docling-2.13.0 → docling-2.14.0}/docling/models/tesseract_ocr_model.py +0 -0
  44. {docling-2.13.0 → docling-2.14.0}/docling/pipeline/__init__.py +0 -0
  45. {docling-2.13.0 → docling-2.14.0}/docling/pipeline/base_pipeline.py +0 -0
  46. {docling-2.13.0 → docling-2.14.0}/docling/pipeline/simple_pipeline.py +0 -0
  47. {docling-2.13.0 → docling-2.14.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  48. {docling-2.13.0 → docling-2.14.0}/docling/py.typed +0 -0
  49. {docling-2.13.0 → docling-2.14.0}/docling/utils/__init__.py +0 -0
  50. {docling-2.13.0 → docling-2.14.0}/docling/utils/accelerator_utils.py +0 -0
  51. {docling-2.13.0 → docling-2.14.0}/docling/utils/export.py +0 -0
  52. {docling-2.13.0 → docling-2.14.0}/docling/utils/glm_utils.py +0 -0
  53. {docling-2.13.0 → docling-2.14.0}/docling/utils/layout_postprocessor.py +0 -0
  54. {docling-2.13.0 → docling-2.14.0}/docling/utils/profiling.py +0 -0
  55. {docling-2.13.0 → docling-2.14.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.13.0
3
+ Version: 2.14.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -0,0 +1,592 @@
1
+ import logging
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Any, Set, Union
5
+
6
+ import lxml
7
+ from bs4 import BeautifulSoup
8
+ from docling_core.types.doc import (
9
+ DocItemLabel,
10
+ DoclingDocument,
11
+ DocumentOrigin,
12
+ GroupLabel,
13
+ TableCell,
14
+ TableData,
15
+ )
16
+ from lxml import etree
17
+ from typing_extensions import TypedDict, override
18
+
19
+ from docling.backend.abstract_backend import DeclarativeDocumentBackend
20
+ from docling.datamodel.base_models import InputFormat
21
+ from docling.datamodel.document import InputDocument
22
+
23
+ _log = logging.getLogger(__name__)
24
+
25
+
26
class Paragraph(TypedDict):
    """A body paragraph together with the titles of its enclosing sections."""

    # Concatenated text content of the paragraph node (newlines stripped).
    text: str
    # Section titles collected by walking "../title" upward: innermost first.
    headers: list[str]
29
+
30
+
31
class Author(TypedDict):
    """An article author and the affiliation names resolved for them."""

    # "<surname> <given-names>" as read from the contrib node.
    name: str
    # Affiliation strings matched via the author's <xref ref-type="aff"> rids.
    affiliation_names: list[str]
34
+
35
+
36
class Table(TypedDict):
    """A table extracted from a <table-wrap> element in the article body."""

    # Text of the <label> element (e.g. the table number), if any.
    label: str
    # Caption text from <caption>/p or <caption>/title (newlines stripped).
    caption: str
    # Raw XML serialization of the inner <table> element.
    content: str
40
+
41
+
42
class FigureCaption(TypedDict):
    """Label and caption text of a <fig> element."""

    # Text of the figure's <label> element, if any.
    label: str
    # Caption text, one line per child node of <caption>.
    caption: str
45
+
46
+
47
class Reference(TypedDict):
    """A bibliographic reference parsed from the article's <ref-list>."""

    # Author names joined with "; ".
    author_names: str
    # Title of the referenced work (<article-title>).
    title: str
    # Journal name (the citation's <source> element).
    journal: str
    # Publication year as text (<year> element).
    year: str
52
+
53
+
54
class XMLComponents(TypedDict):
    """All components parsed from a PubMed (JATS) XML article."""

    title: str
    authors: list[Author]
    abstract: str
    paragraphs: list[Paragraph]
    tables: list[Table]
    figure_captions: list[FigureCaption]
    references: list[Reference]
62
+
63
+
64
class PubMedDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend converting PubMed (JATS) XML articles to DoclingDocument.

    The code from this document backend has been developed by modifying parts of
    the PubMed Parser library (version 0.5.0, released on 12.08.2024):
    Achakulvisut et al., (2020).
    Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML Dataset.
    Journal of Open Source Software, 5(46), 1979,
    https://doi.org/10.21105/joss.01979
    """

    @override
    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Parse the XML source and check that it is a JATS document.

        Raises:
            RuntimeError: if the source cannot be parsed at all.
        """
        super().__init__(in_doc, path_or_stream)
        self.path_or_stream = path_or_stream

        # Initialize parents for the document hierarchy; keys are heading texts.
        self.parents: dict = {}

        self.valid = False
        try:
            if isinstance(self.path_or_stream, BytesIO):
                self.path_or_stream.seek(0)
            self.tree: lxml.etree._ElementTree = etree.parse(self.path_or_stream)
            # Only JATS documents, identified via the DTD public id, are valid.
            if "/NLM//DTD JATS" in self.tree.docinfo.public_id:
                self.valid = True
        except Exception as exc:
            raise RuntimeError(
                f"Could not initialize PubMed backend for file with hash {self.document_hash}."
            ) from exc

    @override
    def is_valid(self) -> bool:
        """Return True if the source parsed as a JATS document."""
        return self.valid

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        """PubMed XML has no page structure."""
        return False

    @override
    def unload(self):
        """Release the underlying stream, if any."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()
        self.path_or_stream = None

    @classmethod
    @override
    def supported_formats(cls) -> Set[InputFormat]:
        """This backend handles only the PubMed XML input format."""
        return {InputFormat.XML_PUBMED}

    @override
    def convert(self) -> DoclingDocument:
        """Convert the parsed XML tree into a DoclingDocument."""
        # Create empty document
        origin = DocumentOrigin(
            filename=self.file.name or "file",
            mimetype="application/xml",
            binary_hash=self.document_hash,
        )
        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)

        _log.debug("Trying to convert PubMed XML document...")

        # Get parsed XML components
        xml_components: XMLComponents = self._parse()

        # Add XML components to the document
        doc = self._populate_document(doc, xml_components)
        return doc

    def _parse_title(self) -> str:
        """Return the article title, flattened to a single line."""
        title: str = " ".join(
            [
                t.replace("\n", "")
                for t in self.tree.xpath(".//title-group/article-title")[0].itertext()
            ]
        )
        return title

    def _parse_authors(self) -> list[Author]:
        """Return the article authors with their resolved affiliation names."""
        # Get mapping between affiliation ids and names
        affiliation_names = []
        for affiliation_node in self.tree.xpath(".//aff[@id]"):
            affiliation_names.append(
                ": ".join([t for t in affiliation_node.itertext() if t != "\n"])
            )
        affiliation_ids_names = {
            aff_id: name
            for aff_id, name in zip(
                self.tree.xpath(".//aff[@id]/@id"), affiliation_names
            )
        }

        # Get author names and affiliation names
        authors: list[Author] = []
        for author_node in self.tree.xpath(
            './/contrib-group/contrib[@contrib-type="author"]'
        ):
            author: Author = {
                "name": "",
                "affiliation_names": [],
            }

            # Affiliation names: resolve each <xref ref-type="aff"> rid.
            affiliation_ids = [
                a.attrib["rid"] for a in author_node.xpath('xref[@ref-type="aff"]')
            ]
            for aff_id in affiliation_ids:
                if aff_id in affiliation_ids_names:
                    author["affiliation_names"].append(affiliation_ids_names[aff_id])

            # Name
            author["name"] = (
                author_node.xpath("name/surname")[0].text
                + " "
                + author_node.xpath("name/given-names")[0].text
            )

            authors.append(author)
        return authors

    def _parse_abstract(self) -> str:
        """Return the concatenated text of all <abstract> elements."""
        texts = []
        for abstract_node in self.tree.xpath(".//abstract"):
            for text in abstract_node.itertext():
                texts.append(text.replace("\n", ""))
        abstract: str = "".join(texts)
        return abstract

    def _parse_main_text(self) -> list[Paragraph]:
        """Return all body paragraphs together with their section headers."""
        paragraphs: list[Paragraph] = []
        for paragraph_node in self.tree.xpath("//body//p"):
            # Skip captions: those are handled by the table/figure parsers.
            if "/caption" in paragraph_node.getroottree().getpath(paragraph_node):
                continue

            paragraph: Paragraph = {"text": "", "headers": []}

            # Text
            paragraph["text"] = "".join(
                [t.replace("\n", "") for t in paragraph_node.itertext()]
            )

            # Header: collect the titles of all enclosing sections by walking
            # "../title", "../../title", ... — innermost section first.
            path = "../title"
            while len(paragraph_node.xpath(path)) > 0:
                paragraph["headers"].append(
                    "".join(
                        [
                            t.replace("\n", "")
                            for t in paragraph_node.xpath(path)[0].itertext()
                        ]
                    )
                )
                path = "../" + path

            paragraphs.append(paragraph)

        return paragraphs

    def _parse_tables(self) -> list[Table]:
        """Return label, caption, and raw XML content for each <table-wrap>."""
        tables: list[Table] = []
        for table_node in self.tree.xpath(".//body//table-wrap"):
            table: Table = {"label": "", "caption": "", "content": ""}

            # Content: either a direct <table> child or one under <alternatives>.
            if len(table_node.xpath("table")) > 0:
                table_content_node = table_node.xpath("table")[0]
            elif len(table_node.xpath("alternatives/table")) > 0:
                table_content_node = table_node.xpath("alternatives/table")[0]
            else:
                table_content_node = None
            if table_content_node is not None:
                table["content"] = etree.tostring(table_content_node).decode("utf-8")

            # Caption: prefer <caption>/p over <caption>/title.
            if len(table_node.xpath("caption/p")) > 0:
                caption_node = table_node.xpath("caption/p")[0]
            elif len(table_node.xpath("caption/title")) > 0:
                caption_node = table_node.xpath("caption/title")[0]
            else:
                caption_node = None
            if caption_node is not None:
                table["caption"] = "".join(
                    [t.replace("\n", "") for t in caption_node.itertext()]
                )

            # Label
            if len(table_node.xpath("label")) > 0:
                table["label"] = table_node.xpath("label")[0].text

            tables.append(table)
        return tables

    def _parse_figure_captions(self) -> list[FigureCaption]:
        """Return label and caption text for each <fig> element."""
        figure_captions: list[FigureCaption] = []

        if not (self.tree.xpath(".//fig")):
            return figure_captions

        for figure_node in self.tree.xpath(".//fig"):
            figure_caption: FigureCaption = {
                "caption": "",
                "label": "",
            }

            # Label
            if figure_node.xpath("label"):
                figure_caption["label"] = "".join(
                    [
                        t.replace("\n", "")
                        for t in figure_node.xpath("label")[0].itertext()
                    ]
                )

            # Caption: one output line per child of <caption>.
            if figure_node.xpath("caption"):
                caption = ""
                for caption_node in figure_node.xpath("caption")[0].getchildren():
                    caption += (
                        "".join([t.replace("\n", "") for t in caption_node.itertext()])
                        + "\n"
                    )
                figure_caption["caption"] = caption

            figure_captions.append(figure_caption)

        return figure_captions

    def _parse_references(self) -> list[Reference]:
        """Return structured references from the article's <ref-list>."""
        references: list[Reference] = []
        for reference_node_abs in self.tree.xpath(".//ref-list/ref"):
            reference: Reference = {
                "author_names": "",
                "title": "",
                "journal": "",
                "year": "",
            }
            # Find the citation element, trying the known tag variants in order.
            reference_node: Any = None
            for tag in ["mixed-citation", "element-citation", "citation"]:
                if len(reference_node_abs.xpath(tag)) > 0:
                    reference_node = reference_node_abs.xpath(tag)[0]
                    break

            if reference_node is None:
                continue

            # Skip citations that declare no citation/publication type attribute.
            if not any(
                ref_type in ["citation-type", "publication-type"]
                for ref_type in reference_node.attrib.keys()
            ):
                continue

            # Author names
            names = []
            if len(reference_node.xpath("name")) > 0:
                for name_node in reference_node.xpath("name"):
                    name_str = " ".join(
                        [t.text for t in name_node.getchildren() if t.text is not None]
                    )
                    names.append(name_str)
            elif len(reference_node.xpath("person-group")) > 0:
                for name_node in reference_node.xpath("person-group")[0]:
                    name_str = (
                        name_node.xpath("given-names")[0].text
                        + " "
                        + name_node.xpath("surname")[0].text
                    )
                    names.append(name_str)
            reference["author_names"] = "; ".join(names)

            # Title
            if len(reference_node.xpath("article-title")) > 0:
                reference["title"] = " ".join(
                    [
                        t.replace("\n", " ")
                        for t in reference_node.xpath("article-title")[0].itertext()
                    ]
                )

            # Journal
            if len(reference_node.xpath("source")) > 0:
                reference["journal"] = reference_node.xpath("source")[0].text

            # Year
            if len(reference_node.xpath("year")) > 0:
                reference["year"] = reference_node.xpath("year")[0].text

            # NOTE(review): "journal" here looks inconsistent with the "source"
            # tag used above — confirm whether this fallback ever triggers.
            if (
                not (reference_node.xpath("article-title"))
                and not (reference_node.xpath("journal"))
                and not (reference_node.xpath("year"))
            ):
                reference["title"] = reference_node.text

            references.append(reference)
        return references

    def _parse(self) -> XMLComponents:
        """Parsing PubMed document."""
        xml_components: XMLComponents = {
            "title": self._parse_title(),
            "authors": self._parse_authors(),
            "abstract": self._parse_abstract(),
            "paragraphs": self._parse_main_text(),
            "tables": self._parse_tables(),
            "figure_captions": self._parse_figure_captions(),
            "references": self._parse_references(),
        }
        return xml_components

    def _populate_document(
        self, doc: DoclingDocument, xml_components: XMLComponents
    ) -> DoclingDocument:
        """Add all parsed components to the document, in reading order."""
        self._add_title(doc, xml_components)
        self._add_authors(doc, xml_components)
        self._add_abstract(doc, xml_components)
        self._add_main_text(doc, xml_components)

        if xml_components["tables"]:
            self._add_tables(doc, xml_components)

        if xml_components["figure_captions"]:
            self._add_figure_captions(doc, xml_components)

        self._add_references(doc, xml_components)
        return doc

    def _add_figure_captions(
        self, doc: DoclingDocument, xml_components: XMLComponents
    ) -> None:
        """Add a "Figures" section with one captioned picture per figure."""
        self.parents["Figures"] = doc.add_heading(
            parent=self.parents["Title"], text="Figures"
        )
        for figure_caption_xml_component in xml_components["figure_captions"]:
            figure_caption_text = (
                figure_caption_xml_component["label"]
                + ": "
                + figure_caption_xml_component["caption"].strip()
            )
            fig_caption = doc.add_text(
                label=DocItemLabel.CAPTION, text=figure_caption_text
            )
            doc.add_picture(
                parent=self.parents["Figures"],
                caption=fig_caption,
            )
        return

    def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
        """Add the article title; it becomes the root of the hierarchy."""
        self.parents["Title"] = doc.add_text(
            parent=None,
            text=xml_components["title"],
            label=DocItemLabel.TITLE,
        )
        return

    def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
        """Add a paragraph listing authors and their affiliations."""
        authors_affiliations: list = []
        for author in xml_components["authors"]:
            authors_affiliations.append(author["name"])
            authors_affiliations.append(", ".join(author["affiliation_names"]))
        authors_affiliations_str = "; ".join(authors_affiliations)

        doc.add_text(
            parent=self.parents["Title"],
            text=authors_affiliations_str,
            label=DocItemLabel.PARAGRAPH,
        )
        return

    def _add_abstract(
        self, doc: DoclingDocument, xml_components: XMLComponents
    ) -> None:
        """Add an "Abstract" heading followed by the abstract text."""
        abstract_text: str = xml_components["abstract"]
        self.parents["Abstract"] = doc.add_heading(
            parent=self.parents["Title"], text="Abstract"
        )
        doc.add_text(
            parent=self.parents["Abstract"],
            text=abstract_text,
            label=DocItemLabel.TEXT,
        )
        return

    def _add_main_text(
        self, doc: DoclingDocument, xml_components: XMLComponents
    ) -> None:
        """Add body paragraphs under their section-heading hierarchy.

        Paragraphs without any enclosing section title are skipped.
        """
        added_headers: list = []
        for paragraph in xml_components["paragraphs"]:
            if not (paragraph["headers"]):
                continue

            # Headers, outermost first; each new heading is attached to the
            # previously created heading one level up (or to the title).
            headers_outermost_first = list(reversed(paragraph["headers"]))
            for i, header in enumerate(headers_outermost_first):
                if header in added_headers:
                    continue
                added_headers.append(header)

                if ((i - 1) >= 0) and headers_outermost_first[i - 1] in self.parents:
                    parent = self.parents[headers_outermost_first[i - 1]]
                else:
                    parent = self.parents["Title"]

                self.parents[header] = doc.add_heading(parent=parent, text=header)

            # Paragraph text goes under its innermost section heading.
            if paragraph["headers"][0] in self.parents:
                parent = self.parents[paragraph["headers"][0]]
            else:
                parent = self.parents["Title"]

            doc.add_text(parent=parent, label=DocItemLabel.TEXT, text=paragraph["text"])
        return

    def _add_references(
        self, doc: DoclingDocument, xml_components: XMLComponents
    ) -> None:
        """Add a "References" section with one list item per reference."""
        self.parents["References"] = doc.add_heading(
            parent=self.parents["Title"], text="References"
        )
        current_list = doc.add_group(
            parent=self.parents["References"], label=GroupLabel.LIST, name="list"
        )
        for reference in xml_components["references"]:
            reference_text: str = ""
            if reference["author_names"]:
                reference_text += reference["author_names"] + ". "

            if reference["title"]:
                reference_text += reference["title"]
                if reference["title"][-1] != ".":
                    reference_text += "."
                reference_text += " "

            if reference["journal"]:
                reference_text += reference["journal"]

            if reference["year"]:
                reference_text += " (" + reference["year"] + ")"

            if not (reference_text):
                _log.debug(f"Skipping reference for: {str(self.file)}")
                continue

            doc.add_list_item(
                text=reference_text, enumerated=False, parent=current_list
            )
        return

    def _add_tables(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
        """Add a "Tables" section; tables that fail to parse are skipped."""
        self.parents["Tables"] = doc.add_heading(
            parent=self.parents["Title"], text="Tables"
        )
        for table_xml_component in xml_components["tables"]:
            try:
                self._add_table(doc, table_xml_component)
            except Exception:
                _log.debug(f"Skipping unsupported table for: {str(self.file)}")
        return

    def _add_table(self, doc: DoclingDocument, table_xml_component: Table) -> None:
        """Convert one HTML-like table into TableData and add it to the document.

        Raises on malformed tables (e.g. spans outside the grid); the caller
        catches and skips those.
        """
        soup = BeautifulSoup(table_xml_component["content"], "html.parser")
        table_tag = soup.find("table")

        # Nested tables are not supported by the grid model below.
        nested_tables = table_tag.find("table")
        if nested_tables:
            _log.debug(f"Skipping nested table for: {str(self.file)}")
            return

        # Count the number of rows (number of <tr> elements)
        num_rows = len(table_tag.find_all("tr"))

        # Find the number of columns (taking into account colspan)
        num_cols = 0
        for row in table_tag.find_all("tr"):
            col_count = 0
            for cell in row.find_all(["td", "th"]):
                colspan = int(cell.get("colspan", 1))
                col_count += colspan
            num_cols = max(num_cols, col_count)

        # Occupancy grid used to place cells around row/col spans.
        grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]

        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

        # Iterate over the rows in the table
        for row_idx, row in enumerate(table_tag.find_all("tr")):
            # For each row, find all the column cells (both <td> and <th>)
            cells = row.find_all(["td", "th"])

            # A row made entirely of <th> cells is treated as a column header.
            col_header = True
            for html_cell in cells:
                if html_cell.name == "td":
                    col_header = False

            # Extract the text content of each cell and place it in the grid.
            col_idx = 0
            for html_cell in cells:
                text = html_cell.text

                col_span = int(html_cell.get("colspan", 1))
                row_span = int(html_cell.get("rowspan", 1))

                # Advance past positions already claimed by spanning cells.
                while grid[row_idx][col_idx] is not None:
                    col_idx += 1
                for r in range(row_span):
                    for c in range(col_span):
                        grid[row_idx + r][col_idx + c] = text

                cell = TableCell(
                    text=text,
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=row_idx,
                    end_row_offset_idx=row_idx + row_span,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + col_span,
                    col_header=col_header,
                    row_header=((not col_header) and html_cell.name == "th"),
                )
                data.table_cells.append(cell)

        table_caption = doc.add_text(
            label=DocItemLabel.CAPTION,
            text=table_xml_component["label"] + ": " + table_xml_component["caption"],
        )
        doc.add_table(data=data, parent=self.parents["Tables"], caption=table_caption)
        return
@@ -33,6 +33,7 @@ class InputFormat(str, Enum):
33
33
  DOCX = "docx"
34
34
  PPTX = "pptx"
35
35
  HTML = "html"
36
+ XML_PUBMED = "xml_pubmed"
36
37
  IMAGE = "image"
37
38
  PDF = "pdf"
38
39
  ASCIIDOC = "asciidoc"
@@ -55,6 +56,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
55
56
  InputFormat.PDF: ["pdf"],
56
57
  InputFormat.MD: ["md"],
57
58
  InputFormat.HTML: ["html", "htm", "xhtml"],
59
+ InputFormat.XML_PUBMED: ["xml", "nxml"],
58
60
  InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
59
61
  InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
60
62
  InputFormat.XLSX: ["xlsx"],
@@ -72,6 +74,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
72
74
  "application/vnd.openxmlformats-officedocument.presentationml.presentation",
73
75
  ],
74
76
  InputFormat.HTML: ["text/html", "application/xhtml+xml"],
77
+ InputFormat.XML_PUBMED: ["application/xml"],
75
78
  InputFormat.IMAGE: [
76
79
  "image/png",
77
80
  "image/jpeg",
@@ -292,8 +292,7 @@ class _DocumentConversionInput(BaseModel):
292
292
  mime = mime or "text/plain"
293
293
  formats = MimeTypeToFormat.get(mime, [])
294
294
  if formats:
295
- # TODO: remove application/xml case after adding another XML parse
296
- if len(formats) == 1 and mime not in ("text/plain", "application/xml"):
295
+ if len(formats) == 1 and mime not in ("text/plain"):
297
296
  return formats[0]
298
297
  else: # ambiguity in formats
299
298
  return _DocumentConversionInput._guess_from_content(
@@ -325,6 +324,12 @@ class _DocumentConversionInput(BaseModel):
325
324
  ):
326
325
  input_format = InputFormat.XML_USPTO
327
326
 
327
+ if (
328
+ InputFormat.XML_PUBMED in formats
329
+ and "/NLM//DTD JATS" in xml_doctype
330
+ ):
331
+ input_format = InputFormat.XML_PUBMED
332
+
328
333
  elif mime == "text/plain":
329
334
  if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
330
335
  input_format = InputFormat.XML_USPTO
@@ -340,7 +345,6 @@ class _DocumentConversionInput(BaseModel):
340
345
  mime = FormatToMimeType[InputFormat.HTML][0]
341
346
  elif ext in FormatToExtensions[InputFormat.MD]:
342
347
  mime = FormatToMimeType[InputFormat.MD][0]
343
-
344
348
  return mime
345
349
 
346
350
  @staticmethod
@@ -370,4 +374,10 @@ class _DocumentConversionInput(BaseModel):
370
374
  if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
371
375
  return "text/html"
372
376
 
377
+ p = re.compile(
378
+ r"<!doctype\s+(?P<root>[a-zA-Z_:][a-zA-Z0-9_:.-]*)\s+.*>\s*<(?P=root)\b"
379
+ )
380
+ if p.search(content_str):
381
+ return "application/xml"
382
+
373
383
  return None
@@ -15,6 +15,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
15
15
  from docling.backend.msexcel_backend import MsExcelDocumentBackend
16
16
  from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
17
17
  from docling.backend.msword_backend import MsWordDocumentBackend
18
+ from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
18
19
  from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
19
20
  from docling.datamodel.base_models import (
20
21
  ConversionStatus,
@@ -88,6 +89,11 @@ class PatentUsptoFormatOption(FormatOption):
88
89
  backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
89
90
 
90
91
 
92
+ class XMLPubMedFormatOption(FormatOption):
93
+ pipeline_cls: Type = SimplePipeline
94
+ backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
95
+
96
+
91
97
  class ImageFormatOption(FormatOption):
92
98
  pipeline_cls: Type = StandardPdfPipeline
93
99
  backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
@@ -121,6 +127,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
121
127
  InputFormat.XML_USPTO: FormatOption(
122
128
  pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
123
129
  ),
130
+ InputFormat.XML_PUBMED: FormatOption(
131
+ pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
132
+ ),
124
133
  InputFormat.IMAGE: FormatOption(
125
134
  pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
126
135
  ),
@@ -171,7 +180,6 @@ class DocumentConverter:
171
180
  max_num_pages: int = sys.maxsize,
172
181
  max_file_size: int = sys.maxsize,
173
182
  ) -> ConversionResult:
174
-
175
183
  all_res = self.convert_all(
176
184
  source=[source],
177
185
  raises_on_error=raises_on_error,
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "2.13.0" # DO NOT EDIT, updated automatically
3
+ version = "2.14.0" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes