PyPI - docling - Versions diffs - 2.5.2__tar.gz → 2.6.0__tar.gz - Mend

docling 2.5.2 (tar.gz) → 2.6.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

{docling-2.5.2 → docling-2.6.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.5.2
+Version: 2.6.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -23,13 +23,14 @@ Provides-Extra: tesserocr
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
-Requires-Dist: docling-core (>=2.3.0,<3.0.0)
+Requires-Dist: docling-core (>=2.4.0,<3.0.0)
 Requires-Dist: docling-ibm-models (>=2.0.3,<3.0.0)
 Requires-Dist: docling-parse (>=2.0.2,<3.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)
 Requires-Dist: marko (>=2.1.2,<3.0.0)
+Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
 Requires-Dist: pandas (>=2.1.4,<3.0.0)
 Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)

docling-2.6.0/docling/backend/msexcel_backend.py ADDED Viewed

@@ -0,0 +1,374 @@
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import Dict, Set, Tuple, Union
+from docling_core.types.doc import (
+    DoclingDocument,
+    DocumentOrigin,
+    GroupLabel,
+    ImageRef,
+    TableCell,
+    TableData,
+)
+# from lxml import etree
+from openpyxl import Workbook, load_workbook
+from openpyxl.cell.cell import Cell
+from openpyxl.drawing.image import Image
+from openpyxl.worksheet.worksheet import Worksheet
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+_log = logging.getLogger(__name__)
+from typing import Any, List
+from pydantic import BaseModel
+class ExcelCell(BaseModel):
+    """One cell of a table detected in a worksheet."""
+    # Row offset of the cell relative to the first row of its table (0-based).
+    row: int
+    # Column offset of the cell relative to the first column of its table (0-based).
+    col: int
+    # Cell content, produced with str(cell.value).
+    text: str
+    # Number of rows covered; 1 unless the cell lies in a merged range.
+    row_span: int
+    # Number of columns covered; 1 unless the cell lies in a merged range.
+    col_span: int
+class ExcelTable(BaseModel):
+    """A rectangular table detected in a worksheet."""
+    # Height of the table's bounding box in rows.
+    num_rows: int
+    # Width of the table's bounding box in columns.
+    num_cols: int
+    # Cells of the table, positioned relative to the table's top-left corner.
+    data: List[ExcelCell]
+class MsExcelDocumentBackend(DeclarativeDocumentBackend):
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        """Open the XLSX workbook given as a filesystem path or in-memory stream.
+        Raises:
+            RuntimeError: if the workbook cannot be loaded; the underlying
+                exception is chained as the cause.
+        """
+        super().__init__(in_doc, path_or_stream)
+        # Parent node per hierarchy level (-1 .. max_levels - 1); level 0 is
+        # set to the group of the sheet being converted.
+        self.max_levels = 10
+        self.parents: Dict[int, Any] = {
+            level: None for level in range(-1, self.max_levels)
+        }
+        self.workbook = None
+        try:
+            if isinstance(self.path_or_stream, BytesIO):
+                self.workbook = load_workbook(filename=self.path_or_stream)
+            elif isinstance(self.path_or_stream, Path):
+                self.workbook = load_workbook(filename=str(self.path_or_stream))
+            self.valid = True
+        except Exception as e:
+            self.valid = False
+            # The message names this backend so load failures are attributed
+            # to the Excel backend rather than to another format.
+            raise RuntimeError(
+                f"MsExcelDocumentBackend could not load document with hash {self.document_hash}"
+            ) from e
+    def is_valid(self) -> bool:
+        """Report whether the workbook was loaded successfully, logging the state."""
+        state = self.valid
+        _log.info(f"valid: {state}")
+        return state
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        """Declare pagination support; this backend always answers True."""
+        # NOTE(review): the class derives only from DeclarativeDocumentBackend
+        # here -- confirm that callers really treat workbooks as paginated.
+        return True
+    def unload(self):
+        """Close the input stream, if it is one, and drop the reference to the input."""
+        source = self.path_or_stream
+        # Only in-memory streams are closed; a Path needs no cleanup.
+        if isinstance(source, BytesIO):
+            source.close()
+        self.path_or_stream = None
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        """Return the input formats this backend handles: XLSX only."""
+        return {InputFormat.XLSX}
+    def convert(self) -> DoclingDocument:
+        """Parse the XLSX workbook into a structured DoclingDocument.
+        Raises:
+            RuntimeError: if the backend did not initialise successfully.
+        """
+        doc_origin = DocumentOrigin(
+            filename=self.file.name or "file.xlsx",
+            mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+            binary_hash=self.document_hash,
+        )
+        result = DoclingDocument(name=self.file.stem or "file.xlsx", origin=doc_origin)
+        # Guard clause: without a valid backend there is nothing to convert.
+        if not self.is_valid():
+            raise RuntimeError(
+                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
+            )
+        return self._convert_workbook(result)
+    def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
+        """Add one section group per worksheet and convert each sheet's content.
+        Sheets that are not cell-based worksheets (e.g. chartsheets) are
+        skipped, because table detection needs row and cell access.
+        Returns the document, unchanged if no workbook is loaded.
+        """
+        if self.workbook is None:
+            _log.error("Workbook is not initialized.")
+            return doc
+        # Iterate over all sheets
+        for sheet_name in self.workbook.sheetnames:
+            _log.info(f"Processing sheet: {sheet_name}")
+            # Access the sheet by name
+            sheet = self.workbook[sheet_name]
+            if not isinstance(sheet, Worksheet):
+                # sheetnames also lists chartsheets, which have no cell grid
+                # to iterate over.
+                _log.info(f"Skipping non-worksheet sheet: {sheet_name}")
+                continue
+            # Everything found in this sheet is attached to its own group.
+            self.parents[0] = doc.add_group(
+                parent=None,
+                label=GroupLabel.SECTION,
+                name=f"sheet: {sheet_name}",
+            )
+            doc = self._convert_sheet(doc, sheet)
+        return doc
+    def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet):
+        """Convert a single worksheet: tables first, then images."""
+        with_tables = self._find_tables_in_sheet(doc, sheet)
+        return self._find_images_in_sheet(with_tables, sheet)
+    def _find_tables_in_sheet(self, doc: DoclingDocument, sheet: Worksheet):
+        """Detect the data tables of a sheet and add each one to the document."""
+        for found in self._find_data_tables(sheet):
+            table_data = TableData(
+                num_rows=found.num_rows,
+                num_cols=found.num_cols,
+                table_cells=[],
+            )
+            # Translate every detected cell; end offsets are start + span.
+            table_data.table_cells.extend(
+                TableCell(
+                    text=src.text,
+                    row_span=src.row_span,
+                    col_span=src.col_span,
+                    start_row_offset_idx=src.row,
+                    end_row_offset_idx=src.row + src.row_span,
+                    start_col_offset_idx=src.col,
+                    end_col_offset_idx=src.col + src.col_span,
+                    col_header=False,
+                    row_header=False,
+                )
+                for src in found.data
+            )
+            # Tables hang below the group of the sheet being converted.
+            doc.add_table(data=table_data, parent=self.parents[0])
+        return doc
+    def _find_data_tables(self, sheet: Worksheet):
+        """
+        Collect every compact rectangular data table of a sheet.
+        The sheet is scanned row by row; a table starts at each non-empty cell
+        that no previously found table has already claimed.
+        """
+        found_tables = []
+        seen: set[Tuple[int, int]] = set()  # 0-based coordinates already claimed
+        for row_idx, row_cells in enumerate(sheet.iter_rows(values_only=False)):
+            for col_idx, cell in enumerate(row_cells):
+                if cell.value is not None and (row_idx, col_idx) not in seen:
+                    # This cell opens a new table: resolve its extent and cells.
+                    table, covered = self._find_table_bounds(
+                        sheet, row_idx, col_idx, seen
+                    )
+                    seen.update(covered)
+                    found_tables.append(table)
+        return found_tables
+    def _find_table_bounds(
+        self,
+        sheet: Worksheet,
+        start_row: int,
+        start_col: int,
+        visited: set[Tuple[int, int]],
+    ):
+        """
+        Determine the bounds of a compact rectangular table.
+        The table is anchored at (start_row, start_col), both 0-based. Its
+        extent is probed downwards along the start column and rightwards along
+        the start row. The `visited` argument is not read by this method.
+        Returns:
+        - An ExcelTable with the table size and its cells.
+        - A set of visited cell coordinates (0-based sheet coordinates).
+        """
+        _log.info("find_table_bounds")
+        max_row = self._find_table_bottom(sheet, start_row, start_col)
+        max_col = self._find_table_right(sheet, start_row, start_col)
+        # Collect the data within the bounds
+        data = []
+        visited_cells = set()
+        for ri in range(start_row, max_row + 1):
+            for rj in range(start_col, max_col + 1):
+                cell = sheet.cell(row=ri + 1, column=rj + 1)  # 1-based indexing
+                # Check if the cell belongs to a merged range
+                row_span = 1
+                col_span = 1
+                # The first merged range containing the cell decides its span.
+                for merged_range in sheet.merged_cells.ranges:
+                    if (
+                        merged_range.min_row <= ri + 1
+                        and ri + 1 <= merged_range.max_row
+                        and merged_range.min_col <= rj + 1
+                        and rj + 1 <= merged_range.max_col
+                    ):
+                        row_span = merged_range.max_row - merged_range.min_row + 1
+                        col_span = merged_range.max_col - merged_range.min_col + 1
+                        break
+                # Cells already covered by an earlier span are not emitted again.
+                if (ri, rj) not in visited_cells:
+                    # NOTE: str(cell.value) turns an empty cell (None) inside
+                    # the bounds into the text "None".
+                    data.append(
+                        ExcelCell(
+                            row=ri - start_row,
+                            col=rj - start_col,
+                            text=str(cell.value),
+                            row_span=row_span,
+                            col_span=col_span,
+                        )
+                    )
+                    # Mark all cells in the span as visited
+                    for span_row in range(ri, ri + row_span):
+                        for span_col in range(rj, rj + col_span):
+                            visited_cells.add((span_row, span_col))
+        return (
+            ExcelTable(
+                num_rows=max_row + 1 - start_row,
+                num_cols=max_col + 1 - start_col,
+                data=data,
+            ),
+            visited_cells,
+        )
+    def _find_table_bottom(self, sheet: Worksheet, start_row: int, start_col: int):
+        """Find the bottom boundary of the table.
+        Walks down the column `start_col` from `start_row` (both 0-based) and
+        returns the 0-based index of the last row that belongs to the table.
+        """
+        max_row = start_row
+        # Stop at the latest on the last row reported by the sheet.
+        while max_row < sheet.max_row - 1:
+            # Look at the cell directly below the current bottom row
+            # (+1 for the next row, +1 for the 1-based sheet.cell API).
+            cell = sheet.cell(row=max_row + 2, column=start_col + 1)
+            # Check if the cell is part of a merged range
+            merged_range = next(
+                (mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
+                None,
+            )
+            if cell.value is None and not merged_range:
+                break  # Stop if the cell is empty and not merged
+            # Expand max_row to include the merged range if applicable
+            if merged_range:
+                # max_row is 0-based, merged_range.max_row is 1-based.
+                max_row = max(max_row, merged_range.max_row - 1)
+            else:
+                max_row += 1
+        return max_row
+    def _find_table_right(self, sheet: Worksheet, start_row: int, start_col: int):
+        """Find the right boundary of the table.
+        Walks right along the row `start_row` from `start_col` (both 0-based)
+        and returns the 0-based index of the last column that belongs to the
+        table.
+        """
+        max_col = start_col
+        # Stop at the latest on the last column reported by the sheet.
+        while max_col < sheet.max_column - 1:
+            # Look at the cell directly right of the current last column
+            # (+1 for the next column, +1 for the 1-based sheet.cell API).
+            cell = sheet.cell(row=start_row + 1, column=max_col + 2)
+            # Check if the cell is part of a merged range
+            merged_range = next(
+                (mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
+                None,
+            )
+            if cell.value is None and not merged_range:
+                break  # Stop if the cell is empty and not merged
+            # Expand max_col to include the merged range if applicable
+            if merged_range:
+                # max_col is 0-based, merged_range.max_col is 1-based.
+                max_col = max(max_col, merged_range.max_col - 1)
+            else:
+                max_col += 1
+        return max_col
+    def _find_images_in_sheet(
+        self, doc: DoclingDocument, sheet: Worksheet
+    ) -> DoclingDocument:
+        # Currently a no-op: the document is returned unchanged. Image and
+        # chart extraction are parked in the two string literals below.
+        # NOTE: the first literal is the first statement of the body, so
+        # Python treats it as this method's docstring.
+        # FIXME: mypy does not agree with _images ...
+        """
+        # Iterate over images in the sheet
+        for idx, image in enumerate(sheet._images):  # Access embedded images
+            image_bytes = BytesIO(image.ref.blob)
+            pil_image = Image.open(image_bytes)
+            doc.add_picture(
+                parent=self.parents[0],
+                image=ImageRef.from_pil(image=pil_image, dpi=72),
+                caption=None,
+            )
+        """
+        # NOTE(review): `Image` in this module is openpyxl.drawing.image.Image,
+        # so `Image.open` above looks like it expects PIL's Image -- confirm
+        # before re-enabling.
+        # FIXME: mypy does not agree with _charts ...
+        """
+        for idx, chart in enumerate(sheet._charts):  # Access embedded charts
+            chart_path = f"chart_{idx + 1}.png"
+            _log.info(
+                f"Chart found, but dynamic rendering is required for: {chart_path}"
+            )
+            _log.info(f"Chart {idx + 1}:")
+            # Chart type
+            _log.info(f"Type: {type(chart).__name__}")
+            # Title
+            if chart.title:
+                _log.info(f"Title: {chart.title}")
+            else:
+                _log.info("No title")
+            # Data series
+            for series in chart.series:
+                _log.info(" => series ...")
+                _log.info(f"Data Series: {series.title}")
+                _log.info(f"Values: {series.values}")
+                _log.info(f"Categories: {series.categories}")
+            # Position
+            # _log.info(f"Anchor Cell: {chart.anchor}")
+        """
+        return doc

{docling-2.5.2 → docling-2.6.0}/docling/backend/mspowerpoint_backend.py RENAMED Viewed

@@ -10,11 +10,13 @@ from docling_core.types.doc import (
     DoclingDocument,
     DocumentOrigin,
     GroupLabel,
+    ImageRef,
     ProvenanceItem,
     Size,
     TableCell,
     TableData,
 )
+from PIL import Image
 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
@@ -268,9 +270,22 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         return
     def handle_pictures(self, shape, parent_slide, slide_ind, doc):
+        # Get the image bytes
+        image = shape.image
+        image_bytes = image.blob
+        im_dpi, _ = image.dpi
+        # Open it with PIL
+        pil_image = Image.open(BytesIO(image_bytes))
         # shape has picture
         prov = self.generate_prov(shape, slide_ind, "")
-        doc.add_picture(parent=parent_slide, caption=None, prov=prov)
+        doc.add_picture(
+            parent=parent_slide,
+            image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
+            caption=None,
+            prov=prov,
+        )
         return
     def handle_tables(self, shape, parent_slide, slide_ind, doc):

{docling-2.5.2 → docling-2.6.0}/docling/backend/msword_backend.py RENAMED Viewed

@@ -9,10 +9,12 @@ from docling_core.types.doc import (
     DoclingDocument,
     DocumentOrigin,
     GroupLabel,
+    ImageRef,
     TableCell,
     TableData,
 )
 from lxml import etree
+from PIL import Image
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.datamodel.base_models import InputFormat
@@ -130,13 +132,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
     def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
         for element in body:
             tag_name = etree.QName(element).localname
-            # Check for Inline Images (drawings or blip elements)
-            found_drawing = etree.ElementBase.xpath(
-                element, ".//w:drawing", namespaces=self.xml_namespaces
-            )
-            found_pict = etree.ElementBase.xpath(
-                element, ".//w:pict", namespaces=self.xml_namespaces
-            )
+            # Check for Inline Images (blip elements)
+            drawing_blip = element.xpath(".//a:blip")
             # Check for Tables
             if element.tag.endswith("tbl"):
@@ -145,8 +142,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 except Exception:
                     _log.debug("could not parse a table, broken docx table")
-            elif found_drawing or found_pict:
-                self.handle_pictures(element, docx_obj, doc)
+            elif drawing_blip:
+                self.handle_pictures(element, docx_obj, drawing_blip, doc)
             # Check for Text
             elif tag_name in ["p"]:
                 self.handle_text_elements(element, docx_obj, doc)
@@ -491,6 +488,24 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         doc.add_table(data=data, parent=self.parents[level - 1])
         return
-    def handle_pictures(self, element, docx_obj, doc):
-        doc.add_picture(parent=self.parents[self.level], caption=None)
+    def handle_pictures(self, element, docx_obj, drawing_blip, doc):
+        """Add the picture referenced by `drawing_blip` to the document.
+        The image bytes are resolved through the blip's relationship id. When
+        the relationship is missing or Pillow cannot read the data, the picture
+        is still recorded, just without image content, so one bad image does
+        not abort the whole conversion.
+        """
+        def get_docx_image(drawing_blip):
+            """Return the embedded image bytes, or None if they cannot be resolved."""
+            if not drawing_blip:
+                return None
+            rel_id = drawing_blip[0].get(
+                "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
+            )
+            # Without a matching relationship there is no image part to read.
+            if rel_id is None or rel_id not in docx_obj.part.rels:
+                return None
+            # Access the image part using the relationship ID
+            return docx_obj.part.rels[rel_id].target_part.blob
+        parent = self.parents[self.level]
+        image_data = get_docx_image(drawing_blip)
+        image_ref = None
+        if image_data is None:
+            _log.warning("could not find the image data of a docx picture")
+        else:
+            try:
+                # Open the bytes with PIL to create an Image
+                pil_image = Image.open(BytesIO(image_data))
+                image_ref = ImageRef.from_pil(image=pil_image, dpi=72)
+            except OSError:
+                # Pillow signals unreadable or unsupported image data with
+                # OSError (UnidentifiedImageError is a subclass).
+                _log.warning("could not load a docx picture with Pillow")
+        if image_ref is None:
+            doc.add_picture(parent=parent, caption=None)
+        else:
+            doc.add_picture(parent=parent, image=image_ref, caption=None)
+        return

{docling-2.5.2 → docling-2.6.0}/docling/cli/main.py RENAMED Viewed

@@ -1,6 +1,7 @@
 import importlib
 import json
 import logging
+import re
 import time
 import warnings
 from enum import Enum
@@ -129,6 +130,12 @@ def export_documents(
     )
+def _split_list(raw: Optional[str]) -> Optional[List[str]]:
+    """Split a comma- or semicolon-separated option value into clean items.
+    Whitespace around items is stripped and empty items are dropped, so
+    "en, de," yields ["en", "de"]. None (option not given) is passed through.
+    """
+    if raw is None:
+        return None
+    stripped = (item.strip() for item in re.split(r"[;,]", raw))
+    return [item for item in stripped if item]
 @app.command(no_args_is_help=True)
 def convert(
     input_sources: Annotated[
@@ -163,6 +170,13 @@ def convert(
     ocr_engine: Annotated[
         OcrEngine, typer.Option(..., help="The OCR engine to use.")
     ] = OcrEngine.EASYOCR,
+    ocr_lang: Annotated[
+        Optional[str],
+        typer.Option(
+            ...,
+            help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
+        ),
+    ] = None,
     pdf_backend: Annotated[
         PdfBackend, typer.Option(..., help="The PDF backend to use.")
     ] = PdfBackend.DLPARSE_V1,
@@ -185,6 +199,15 @@ def convert(
     output: Annotated[
         Path, typer.Option(..., help="Output directory where results are saved.")
     ] = Path("."),
+    verbose: Annotated[
+        int,
+        typer.Option(
+            "--verbose",
+            "-v",
+            count=True,
+            help="Set the verbosity level. -v for info logging, -vv for debug logging.",
+        ),
+    ] = 0,
     version: Annotated[
         Optional[bool],
         typer.Option(
@@ -195,7 +218,12 @@ def convert(
         ),
     ] = None,
 ):
-    logging.basicConfig(level=logging.INFO)
+    if verbose == 0:
+        logging.basicConfig(level=logging.WARNING)
+    elif verbose == 1:
+        logging.basicConfig(level=logging.INFO)
+    elif verbose == 2:
+        logging.basicConfig(level=logging.DEBUG)
     if from_formats is None:
         from_formats = [e for e in InputFormat]
@@ -234,6 +262,10 @@ def convert(
         case _:
             raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
+    ocr_lang_list = _split_list(ocr_lang)
+    if ocr_lang_list is not None:
+        ocr_options.lang = ocr_lang_list
     pipeline_options = PdfPipelineOptions(
         do_ocr=ocr,
         ocr_options=ocr_options,
@@ -287,5 +319,7 @@ def convert(
     _log.info(f"All documents were converted in {end_time:.2f} seconds.")
+click_app = typer.main.get_command(app)
 if __name__ == "__main__":
     app()

{docling-2.5.2 → docling-2.6.0}/docling/datamodel/base_models.py RENAMED Viewed

@@ -32,6 +32,7 @@ class InputFormat(str, Enum):
     PDF = "pdf"
     ASCIIDOC = "asciidoc"
     MD = "md"
+    XLSX = "xlsx"
 class OutputFormat(str, Enum):
@@ -49,6 +50,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.HTML: ["html", "htm", "xhtml"],
     InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
     InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
+    InputFormat.XLSX: ["xlsx"],
 }
 FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -72,7 +74,11 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
     InputFormat.PDF: ["application/pdf"],
     InputFormat.ASCIIDOC: ["text/asciidoc"],
     InputFormat.MD: ["text/markdown", "text/x-markdown"],
+    InputFormat.XLSX: [
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+    ],
 }
 MimeTypeToFormat = {
     mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
 }

{docling-2.5.2 → docling-2.6.0}/docling/datamodel/pipeline_options.py RENAMED Viewed

@@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel):
 class OcrOptions(BaseModel):
     kind: str
+    lang: List[str]
     force_full_page_ocr: bool = False  # If enabled a full page OCR is always applied
     bitmap_area_threshold: float = (
         0.05  # percentage of the area for a bitmap to processed with OCR
@@ -81,4 +82,11 @@ class PdfPipelineOptions(PipelineOptions):
     images_scale: float = 1.0
     generate_page_images: bool = False
     generate_picture_images: bool = False
-    generate_table_images: bool = False
+    generate_table_images: bool = Field(
+        default=False,
+        deprecated=(
+            "Field `generate_table_images` is deprecated. "
+            "To obtain table images, set `PdfPipelineOptions.generate_page_images = True` "
+            "before conversion and then use the `TableItem.get_image` function."
+        ),
+    )

{docling-2.5.2 → docling-2.6.0}/docling/document_converter.py RENAMED Viewed

@@ -12,6 +12,7 @@ from docling.backend.asciidoc_backend import AsciiDocBackend
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.md_backend import MarkdownDocumentBackend
+from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
 from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
@@ -44,6 +45,11 @@ class FormatOption(BaseModel):
         return self
+class ExcelFormatOption(FormatOption):
+    """Format option preset for Excel input: SimplePipeline with the Excel backend."""
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
 class WordFormatOption(FormatOption):
     pipeline_cls: Type = SimplePipeline
     backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
@@ -80,6 +86,9 @@ class ImageFormatOption(FormatOption):
 _format_to_default_options = {
+    InputFormat.XLSX: FormatOption(
+        pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
+    ),
     InputFormat.DOCX: FormatOption(
         pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
     ),

{docling-2.5.2 → docling-2.6.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.5.2"  # DO NOT EDIT, updated automatically
+version = "2.6.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"
@@ -26,7 +26,7 @@ packages = [{include = "docling"}]
 ######################
 python = "^3.10"
 pydantic = "^2.0.0"
-docling-core = "^2.3.0"
+docling-core = "^2.4.0"
 docling-ibm-models = "^2.0.3"
 deepsearch-glm = "^0.26.1"
 filetype = "^1.2.0"
@@ -47,6 +47,7 @@ python-pptx = "^1.0.2"
 beautifulsoup4 = "^4.12.3"
 pandas = "^2.1.4"
 marko = "^2.1.2"
+openpyxl = "^3.1.5"
 [tool.poetry.group.dev.dependencies]
 black = {extras = ["jupyter"], version = "^24.4.2"}
@@ -65,10 +66,12 @@ pandas-stubs = "^2.1.4.231227"
 ipykernel = "^6.29.5"
 ipywidgets = "^8.1.5"
 nbqa = "^1.9.0"
+types-openpyxl = "^3.1.5.20241114"
 [tool.poetry.group.docs.dependencies]
 mkdocs-material = "^9.5.40"
 mkdocs-jupyter = "^0.25.0"
+mkdocs-click = "^0.8.1"
 [tool.poetry.group.examples.dependencies]
 datasets = "^2.21.0"