PyPI - docling - Versions diffs - 1.6.2__py3-none-any.whl → 1.19.0__py3-none-any.whl - Mend

docling 1.6.2__py3-none-any.whl → 1.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

docling/backend/abstract_backend.py +17 -8
docling/backend/docling_parse_backend.py +42 -26
docling/backend/pypdfium2_backend.py +33 -11
docling/cli/__init__.py +0 -0
docling/cli/main.py +253 -0
docling/datamodel/base_models.py +39 -27
docling/datamodel/document.py +115 -17
docling/datamodel/pipeline_options.py +67 -0
docling/document_converter.py +65 -44
docling/models/base_ocr_model.py +4 -4
docling/models/ds_glm_model.py +11 -7
docling/models/easyocr_model.py +19 -4
docling/models/layout_model.py +3 -3
docling/models/table_structure_model.py +18 -2
docling/models/tesseract_ocr_cli_model.py +167 -0
docling/models/tesseract_ocr_model.py +122 -0
docling/pipeline/base_model_pipeline.py +4 -3
docling/pipeline/standard_model_pipeline.py +36 -8
docling/utils/export.py +145 -0
{docling-1.6.2.dist-info → docling-1.19.0.dist-info}/LICENSE +1 -1
docling-1.19.0.dist-info/METADATA +380 -0
docling-1.19.0.dist-info/RECORD +34 -0
docling-1.19.0.dist-info/entry_points.txt +3 -0
docling-1.6.2.dist-info/METADATA +0 -192
docling-1.6.2.dist-info/RECORD +0 -27
{docling-1.6.2.dist-info → docling-1.19.0.dist-info}/WHEEL +0 -0

docling/models/table_structure_model.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import copy
+from pathlib import Path
 from typing import Iterable, List
 import numpy
@@ -12,16 +13,22 @@ from docling.datamodel.base_models import (
     TableElement,
     TableStructurePrediction,
 )
+from docling.datamodel.pipeline_options import TableFormerMode
 class TableStructureModel:
     def __init__(self, config):
         self.config = config
         self.do_cell_matching = config["do_cell_matching"]
+        self.mode = config["mode"]
         self.enabled = config["enabled"]
         if self.enabled:
-            artifacts_path = config["artifacts_path"]
+            artifacts_path: Path = config["artifacts_path"]
+            if self.mode == TableFormerMode.ACCURATE:
+                artifacts_path = artifacts_path / "fat"
             # Third Party
             import docling_ibm_models.tableformer.common as c
@@ -44,7 +51,16 @@ class TableStructureModel:
             for tc in table_element.table_cells:
                 x0, y0, x1, y1 = tc.bbox.as_tuple()
-                draw.rectangle([(x0, y0), (x1, y1)], outline="blue")
+                if tc.column_header:
+                    width = 3
+                else:
+                    width = 1
+                draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
+                draw.text(
+                    (x0 + 3, y0 + 3),
+                    text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
+                    fill="black",
+                )
         image.show()

docling/models/tesseract_ocr_cli_model.py ADDED Viewed

@@ -0,0 +1,167 @@
+import io
+import logging
+import tempfile
+from subprocess import PIPE, Popen
+from typing import Iterable, Tuple
+import pandas as pd
+from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.datamodel.pipeline_options import TesseractCliOcrOptions
+from docling.models.base_ocr_model import BaseOcrModel
+_log = logging.getLogger(__name__)
+class TesseractOcrCliModel(BaseOcrModel):
+    def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
+        super().__init__(enabled=enabled, options=options)
+        self.options: TesseractCliOcrOptions
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+        self._name = None
+        self._version = None
+        if self.enabled:
+            try:
+                self._get_name_and_version()
+            except Exception as exc:
+                raise RuntimeError(
+                    f"Tesseract is not available, aborting: {exc} "
+                    "Install tesseract on your system and the tesseract binary is discoverable. "
+                    "The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. "
+                    "Alternatively, Docling has support for other OCR engines. See the documentation."
+                )
+    def _get_name_and_version(self) -> Tuple[str, str]:
+        if self._name != None and self._version != None:
+            return self._name, self._version
+        cmd = [self.options.tesseract_cmd, "--version"]
+        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
+        stdout, stderr = proc.communicate()
+        proc.wait()
+        # HACK: Windows versions of Tesseract output the version to stdout, Linux versions
+        # to stderr, so check both.
+        version_line = (
+            (stdout.decode("utf8").strip() or stderr.decode("utf8").strip())
+            .split("\n")[0]
+            .strip()
+        )
+        # If everything else fails...
+        if not version_line:
+            version_line = "tesseract XXX"
+        name, version = version_line.split(" ")
+        self._name = name
+        self._version = version
+        return name, version
+    def _run_tesseract(self, ifilename: str):
+        cmd = [self.options.tesseract_cmd]
+        if self.options.lang is not None and len(self.options.lang) > 0:
+            cmd.append("-l")
+            cmd.append("+".join(self.options.lang))
+        if self.options.path is not None:
+            cmd.append("--tessdata-dir")
+            cmd.append(self.options.path)
+        cmd += [ifilename, "stdout", "tsv"]
+        _log.info("command: {}".format(" ".join(cmd)))
+        proc = Popen(cmd, stdout=PIPE)
+        output, _ = proc.communicate()
+        # _log.info(output)
+        # Decode the byte string to a regular string
+        decoded_data = output.decode("utf-8")
+        # _log.info(decoded_data)
+        # Read the TSV file generated by Tesseract
+        df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
+        # Display the dataframe (optional)
+        # _log.info("df: ", df.head())
+        # Filter rows that contain actual text (ignore header or empty rows)
+        df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]
+        return df_filtered
+    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        if not self.enabled:
+            yield from page_batch
+            return
+        for page in page_batch:
+            ocr_rects = self.get_ocr_rects(page)
+            all_ocr_cells = []
+            for ocr_rect in ocr_rects:
+                # Skip zero area boxes
+                if ocr_rect.area() == 0:
+                    continue
+                high_res_image = page._backend.get_page_image(
+                    scale=self.scale, cropbox=ocr_rect
+                )
+                with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as image_file:
+                    fname = image_file.name
+                    high_res_image.save(fname)
+                    df = self._run_tesseract(fname)
+                # _log.info(df)
+                # Print relevant columns (bounding box and text)
+                for ix, row in df.iterrows():
+                    text = row["text"]
+                    conf = row["conf"]
+                    l = float(row["left"])
+                    b = float(row["top"])
+                    w = float(row["width"])
+                    h = float(row["height"])
+                    t = b + h
+                    r = l + w
+                    cell = OcrCell(
+                        id=ix,
+                        text=text,
+                        confidence=conf / 100.0,
+                        bbox=BoundingBox.from_tuple(
+                            coord=(
+                                (l / self.scale) + ocr_rect.l,
+                                (b / self.scale) + ocr_rect.t,
+                                (r / self.scale) + ocr_rect.l,
+                                (t / self.scale) + ocr_rect.t,
+                            ),
+                            origin=CoordOrigin.TOPLEFT,
+                        ),
+                    )
+                    all_ocr_cells.append(cell)
+            ## Remove OCR cells which overlap with programmatic cells.
+            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+            page.cells.extend(filtered_ocr_cells)
+            # DEBUG code:
+            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+            yield page

docling/models/tesseract_ocr_model.py ADDED Viewed

@@ -0,0 +1,122 @@
+import logging
+from typing import Iterable
+import numpy
+from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.datamodel.pipeline_options import TesseractCliOcrOptions
+from docling.models.base_ocr_model import BaseOcrModel
+_log = logging.getLogger(__name__)
+class TesseractOcrModel(BaseOcrModel):
+    def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
+        super().__init__(enabled=enabled, options=options)
+        self.options: TesseractCliOcrOptions
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+        self.reader = None
+        if self.enabled:
+            setup_errmsg = (
+                "tesserocr is not correctly installed. "
+                "Please install it via `pip install tesserocr` to use this OCR engine. "
+                "Note that tesserocr might have to be manually compiled for working with"
+                "your Tesseract installation. The Docling documentation provides examples for it. "
+                "Alternatively, Docling has support for other OCR engines. See the documentation."
+            )
+            try:
+                import tesserocr
+            except ImportError:
+                raise ImportError(setup_errmsg)
+            try:
+                tesseract_version = tesserocr.tesseract_version()
+                _log.debug("Initializing TesserOCR: %s", tesseract_version)
+            except:
+                raise ImportError(setup_errmsg)
+            # Initialize the tesseractAPI
+            lang = "+".join(self.options.lang)
+            if self.options.path is not None:
+                self.reader = tesserocr.PyTessBaseAPI(
+                    path=self.options.path,
+                    lang=lang,
+                    psm=tesserocr.PSM.AUTO,
+                    init=True,
+                    oem=tesserocr.OEM.DEFAULT,
+                )
+            else:
+                self.reader = tesserocr.PyTessBaseAPI(
+                    lang=lang,
+                    psm=tesserocr.PSM.AUTO,
+                    init=True,
+                    oem=tesserocr.OEM.DEFAULT,
+                )
+            self.reader_RIL = tesserocr.RIL
+    def __del__(self):
+        if self.reader is not None:
+            # Finalize the tesseractAPI
+            self.reader.End()
+    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        if not self.enabled:
+            yield from page_batch
+            return
+        for page in page_batch:
+            ocr_rects = self.get_ocr_rects(page)
+            all_ocr_cells = []
+            for ocr_rect in ocr_rects:
+                # Skip zero area boxes
+                if ocr_rect.area() == 0:
+                    continue
+                high_res_image = page._backend.get_page_image(
+                    scale=self.scale, cropbox=ocr_rect
+                )
+                # Retrieve text snippets with their bounding boxes
+                self.reader.SetImage(high_res_image)
+                boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)
+                cells = []
+                for ix, (im, box, _, _) in enumerate(boxes):
+                    # Set the area of interest. Tesseract uses Bottom-Left for the origin
+                    self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
+                    # Extract text within the bounding box
+                    text = self.reader.GetUTF8Text().strip()
+                    confidence = self.reader.MeanTextConf()
+                    left = box["x"] / self.scale
+                    bottom = box["y"] / self.scale
+                    right = (box["x"] + box["w"]) / self.scale
+                    top = (box["y"] + box["h"]) / self.scale
+                    cells.append(
+                        OcrCell(
+                            id=ix,
+                            text=text,
+                            confidence=confidence,
+                            bbox=BoundingBox.from_tuple(
+                                coord=(left, top, right, bottom),
+                                origin=CoordOrigin.TOPLEFT,
+                            ),
+                        )
+                    )
+                # del high_res_image
+                all_ocr_cells.extend(cells)
+            ## Remove OCR cells which overlap with programmatic cells.
+            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+            page.cells.extend(filtered_ocr_cells)
+            # DEBUG code:
+            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+            yield page

docling/pipeline/base_model_pipeline.py CHANGED Viewed

@@ -1,12 +1,13 @@
 from pathlib import Path
-from typing import Iterable
+from typing import Callable, Iterable, List
-from docling.datamodel.base_models import Page, PipelineOptions
+from docling.datamodel.base_models import Page
+from docling.datamodel.pipeline_options import PipelineOptions
 class BaseModelPipeline:
     def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
-        self.model_pipe = []
+        self.model_pipe: List[Callable] = []
         self.artifacts_path = artifacts_path
         self.pipeline_options = pipeline_options

docling/pipeline/standard_model_pipeline.py CHANGED Viewed

@@ -1,37 +1,65 @@
 from pathlib import Path
-from docling.datamodel.base_models import PipelineOptions
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    PipelineOptions,
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
+)
+from docling.models.base_ocr_model import BaseOcrModel
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
 from docling.models.table_structure_model import TableStructureModel
+from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
+from docling.models.tesseract_ocr_model import TesseractOcrModel
 from docling.pipeline.base_model_pipeline import BaseModelPipeline
 class StandardModelPipeline(BaseModelPipeline):
-    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
+    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
     _table_model_path = "model_artifacts/tableformer"
     def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
         super().__init__(artifacts_path, pipeline_options)
+        ocr_model: BaseOcrModel
+        if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
+            ocr_model = EasyOcrModel(
+                enabled=pipeline_options.do_ocr,
+                options=pipeline_options.ocr_options,
+            )
+        elif isinstance(pipeline_options.ocr_options, TesseractCliOcrOptions):
+            ocr_model = TesseractOcrCliModel(
+                enabled=pipeline_options.do_ocr,
+                options=pipeline_options.ocr_options,
+            )
+        elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
+            ocr_model = TesseractOcrModel(
+                enabled=pipeline_options.do_ocr,
+                options=pipeline_options.ocr_options,
+            )
+        else:
+            raise RuntimeError(
+                f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
+            )
         self.model_pipe = [
-            EasyOcrModel(
-                config={
-                    "lang": ["fr", "de", "es", "en"],
-                    "enabled": pipeline_options.do_ocr,
-                }
-            ),
+            # OCR
+            ocr_model,
+            # Layout
             LayoutModel(
                 config={
                     "artifacts_path": artifacts_path
                     / StandardModelPipeline._layout_model_path
                 }
             ),
+            # Table structure
             TableStructureModel(
                 config={
                     "artifacts_path": artifacts_path
                     / StandardModelPipeline._table_model_path,
                     "enabled": pipeline_options.do_table_structure,
+                    "mode": pipeline_options.table_structure_options.mode,
                     "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
                 }
             ),

docling/utils/export.py ADDED Viewed

@@ -0,0 +1,145 @@
+import logging
+from typing import Any, Dict, Iterable, List, Tuple, Union
+from docling_core.types.doc.base import BaseCell, BaseText, Ref, Table, TableCell
+from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
+from docling.datamodel.document import ConversionResult, Page
+_log = logging.getLogger(__name__)
+def generate_multimodal_pages(
+    doc_result: ConversionResult,
+) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
+    label_to_doclaynet = {
+        "title": "title",
+        "table-of-contents": "document_index",
+        "subtitle-level-1": "section_header",
+        "checkbox-selected": "checkbox_selected",
+        "checkbox-unselected": "checkbox_unselected",
+        "caption": "caption",
+        "page-header": "page_header",
+        "page-footer": "page_footer",
+        "footnote": "footnote",
+        "table": "table",
+        "formula": "formula",
+        "list-item": "list_item",
+        "code": "code",
+        "figure": "picture",
+        "picture": "picture",
+        "reference": "text",
+        "paragraph": "text",
+        "text": "text",
+    }
+    content_text = ""
+    page_no = 0
+    start_ix = 0
+    end_ix = 0
+    doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []
+    doc = doc_result.output
+    def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):
+        segments = []
+        for ix, item in doc_items:
+            item_type = item.obj_type
+            label = label_to_doclaynet.get(item_type, None)
+            if label is None or item.prov is None or page.size is None:
+                continue
+            bbox = BoundingBox.from_tuple(
+                tuple(item.prov[0].bbox), origin=CoordOrigin.BOTTOMLEFT
+            )
+            new_bbox = bbox.to_top_left_origin(page_height=page.size.height).normalized(
+                page_size=page.size
+            )
+            new_segment = {
+                "index_in_doc": ix,
+                "label": label,
+                "text": item.text if item.text is not None else "",
+                "bbox": new_bbox.as_tuple(),
+                "data": [],
+            }
+            if isinstance(item, Table):
+                table_html = item.export_to_html()
+                new_segment["data"].append(
+                    {
+                        "html_seq": table_html,
+                        "otsl_seq": "",
+                    }
+                )
+            segments.append(new_segment)
+        return segments
+    def _process_page_cells(page: Page):
+        cells: List[dict] = []
+        if page.size is None:
+            return cells
+        for cell in page.cells:
+            new_bbox = cell.bbox.to_top_left_origin(
+                page_height=page.size.height
+            ).normalized(page_size=page.size)
+            is_ocr = isinstance(cell, OcrCell)
+            ocr_confidence = cell.confidence if isinstance(cell, OcrCell) else 1.0
+            cells.append(
+                {
+                    "text": cell.text,
+                    "bbox": new_bbox.as_tuple(),
+                    "ocr": is_ocr,
+                    "ocr_confidence": ocr_confidence,
+                }
+            )
+        return cells
+    def _process_page():
+        page_ix = page_no - 1
+        page = doc_result.pages[page_ix]
+        page_cells = _process_page_cells(page=page)
+        page_segments = _process_page_segments(doc_items=doc_items, page=page)
+        content_md = doc.export_to_markdown(
+            main_text_start=start_ix, main_text_stop=end_ix
+        )
+        # No page-tagging since we only do 1 page at the time
+        content_dt = doc.export_to_document_tokens(
+            main_text_start=start_ix, main_text_stop=end_ix, add_page_index=False
+        )
+        return content_text, content_md, content_dt, page_cells, page_segments, page
+    if doc.main_text is None:
+        return
+    for ix, orig_item in enumerate(doc.main_text):
+        item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
+        if item is None or item.prov is None or len(item.prov) == 0:
+            _log.debug(f"Skipping item {orig_item}")
+            continue
+        item_page = item.prov[0].page
+        # Page is complete
+        if page_no > 0 and item_page > page_no:
+            yield _process_page()
+            start_ix = ix
+            doc_items = []
+            content_text = ""
+        page_no = item_page
+        end_ix = ix
+        doc_items.append((ix, item))
+        if item.text is not None and item.text != "":
+            content_text += item.text + " "
+    if len(doc_items) > 0:
+        yield _process_page()

{docling-1.6.2.dist-info → docling-1.19.0.dist-info}/LICENSE RENAMED Viewed

@@ -1,6 +1,6 @@
 MIT License
-Copyright (c) [year] [fullname]
+Copyright (c) 2024 International Business Machines
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

docling 1.6.2__py3-none-any.whl → 1.19.0__py3-none-any.whl

docling 1.6.2__py3-none-any.whl → 1.19.0__py3-none-any.whl