PyPI - docling - Versions diffs - 1.8.4__py3-none-any.whl → 1.9.0__py3-none-any.whl - Mend

docling 1.8.4__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

docling/datamodel/base_models.py +12 -3
docling/models/ds_glm_model.py +5 -1
docling/models/table_structure_model.py +10 -1
docling/utils/export.py +193 -0
{docling-1.8.4.dist-info → docling-1.9.0.dist-info}/METADATA +7 -4
{docling-1.8.4.dist-info → docling-1.9.0.dist-info}/RECORD +8 -7
{docling-1.8.4.dist-info → docling-1.9.0.dist-info}/LICENSE +0 -0
{docling-1.8.4.dist-info → docling-1.9.0.dist-info}/WHEEL +0 -0

docling/datamodel/base_models.py CHANGED Viewed

@@ -71,6 +71,15 @@ class BoundingBox(BaseModel):
         return out_bbox
+    def normalized(self, page_size: PageSize) -> "BoundingBox":
+        out_bbox = copy.deepcopy(self)
+        out_bbox.l /= page_size.width
+        out_bbox.r /= page_size.width
+        out_bbox.t /= page_size.height
+        out_bbox.b /= page_size.height
+        return out_bbox
     def as_tuple(self):
         if self.coord_origin == CoordOrigin.TOPLEFT:
             return (self.l, self.t, self.r, self.b)
@@ -238,9 +247,9 @@ class EquationPrediction(BaseModel):
 class PagePredictions(BaseModel):
     layout: LayoutPrediction = None
-    tablestructure: TableStructurePrediction = None
-    figures_classification: FigureClassificationPrediction = None
-    equations_prediction: EquationPrediction = None
+    tablestructure: Optional[TableStructurePrediction] = None
+    figures_classification: Optional[FigureClassificationPrediction] = None
+    equations_prediction: Optional[EquationPrediction] = None
 PageElement = Union[TextElement, TableElement, FigureElement]

docling/models/ds_glm_model.py CHANGED Viewed

@@ -16,8 +16,12 @@ from docling.datamodel.document import ConversionResult
 class GlmModel:
     def __init__(self, config):
         self.config = config
+        self.model_names = self.config.get(
+            "model_names", ""
+        )  # "language;term;reference"
         load_pretrained_nlp_models()
-        model = init_nlp_model(model_names="language;term;reference")
+        # model = init_nlp_model(model_names="language;term;reference")
+        model = init_nlp_model(model_names=self.model_names)
         self.model = model
     def __call__(self, conv_res: ConversionResult) -> DsDocument:

docling/models/table_structure_model.py CHANGED Viewed

@@ -44,7 +44,16 @@ class TableStructureModel:
             for tc in table_element.table_cells:
                 x0, y0, x1, y1 = tc.bbox.as_tuple()
-                draw.rectangle([(x0, y0), (x1, y1)], outline="blue")
+                if tc.column_header:
+                    width = 3
+                else:
+                    width = 1
+                draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
+                draw.text(
+                    (x0 + 3, y0 + 3),
+                    text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
+                    fill="black",
+                )
         image.show()

docling/utils/export.py ADDED Viewed

@@ -0,0 +1,193 @@
+import logging
+from typing import Any, Dict, Iterable, List, Tuple
+from docling_core.types.doc.base import BaseCell, Ref, Table, TableCell
+from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
+from docling.datamodel.document import ConvertedDocument, Page
+_log = logging.getLogger(__name__)
+def _export_table_to_html(table: Table):
+    # TODO: this is flagged as internal, because we will move it
+    # to the docling-core package.
+    def _get_tablecell_span(cell: TableCell, ix):
+        span = set([s[ix] for s in cell.spans])
+        if len(span) == 0:
+            return 1, None, None
+        return len(span), min(span), max(span)
+    body = ""
+    nrows = table.num_rows
+    ncols = table.num_cols
+    for i in range(nrows):
+        body += "<tr>"
+        for j in range(ncols):
+            cell: TableCell = table.data[i][j]
+            rowspan, rowstart, rowend = _get_tablecell_span(cell, 0)
+            colspan, colstart, colend = _get_tablecell_span(cell, 1)
+            if rowstart is not None and rowstart != i:
+                continue
+            if colstart is not None and colstart != j:
+                continue
+            if rowstart is None:
+                rowstart = i
+            if colstart is None:
+                colstart = j
+            content = cell.text.strip()
+            label = cell.obj_type
+            label_class = "body"
+            celltag = "td"
+            if label in ["row_header", "row_multi_header", "row_title"]:
+                label_class = "header"
+            elif label in ["col_header", "col_multi_header"]:
+                label_class = "header"
+                celltag = "th"
+            opening_tag = f"{celltag}"
+            if rowspan > 1:
+                opening_tag += f' rowspan="{rowspan}"'
+            if colspan > 1:
+                opening_tag += f' colspan="{colspan}"'
+            body += f"<{opening_tag}>{content}</{celltag}>"
+        body += "</tr>"
+    body = f"<table>{body}</table>"
+    return body
+def generate_multimodal_pages(
+    doc_result: ConvertedDocument,
+) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
+    label_to_doclaynet = {
+        "title": "title",
+        "table-of-contents": "document_index",
+        "subtitle-level-1": "section_header",
+        "checkbox-selected": "checkbox_selected",
+        "checkbox-unselected": "checkbox_unselected",
+        "caption": "caption",
+        "page-header": "page_header",
+        "page-footer": "page_footer",
+        "footnote": "footnote",
+        "table": "table",
+        "formula": "formula",
+        "list-item": "list_item",
+        "code": "code",
+        "figure": "picture",
+        "picture": "picture",
+        "reference": "text",
+        "paragraph": "text",
+        "text": "text",
+    }
+    content_text = ""
+    page_no = 0
+    start_ix = 0
+    end_ix = 0
+    doc_items = []
+    doc = doc_result.output
+    def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):
+        segments = []
+        for ix, item in doc_items:
+            item_type = item.obj_type
+            label = label_to_doclaynet.get(item_type, None)
+            if label is None:
+                continue
+            bbox = BoundingBox.from_tuple(
+                item.prov[0].bbox, origin=CoordOrigin.BOTTOMLEFT
+            )
+            new_bbox = bbox.to_top_left_origin(page_height=page.size.height).normalized(
+                page_size=page.size
+            )
+            new_segment = {
+                "index_in_doc": ix,
+                "label": label,
+                "text": item.text if item.text is not None else "",
+                "bbox": new_bbox.as_tuple(),
+                "data": [],
+            }
+            if isinstance(item, Table):
+                table_html = _export_table_to_html(item)
+                new_segment["data"].append(
+                    {
+                        "html_seq": table_html,
+                        "otsl_seq": "",
+                    }
+                )
+            segments.append(new_segment)
+        return segments
+    def _process_page_cells(page: Page):
+        cells = []
+        for cell in page.cells:
+            new_bbox = cell.bbox.to_top_left_origin(
+                page_height=page.size.height
+            ).normalized(page_size=page.size)
+            is_ocr = isinstance(cell, OcrCell)
+            ocr_confidence = cell.confidence if is_ocr else 1.0
+            cells.append(
+                {
+                    "text": cell.text,
+                    "bbox": new_bbox.as_tuple(),
+                    "ocr": is_ocr,
+                    "ocr_confidence": ocr_confidence,
+                }
+            )
+        return cells
+    def _process_page():
+        page_ix = page_no - 1
+        page = doc_result.pages[page_ix]
+        page_cells = _process_page_cells(page=page)
+        page_segments = _process_page_segments(doc_items=doc_items, page=page)
+        content_md = doc.export_to_markdown(
+            main_text_start=start_ix, main_text_stop=end_ix
+        )
+        return content_text, content_md, page_cells, page_segments, page
+    for ix, orig_item in enumerate(doc.main_text):
+        item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
+        if item is None or item.prov is None or len(item.prov) == 0:
+            _log.debug(f"Skipping item {orig_item}")
+            continue
+        item_page = item.prov[0].page
+        # Page is complete
+        if page_no > 0 and item_page > page_no:
+            yield _process_page()
+            start_ix = ix
+            doc_items = []
+            content_text = ""
+        page_no = item_page
+        end_ix = ix
+        doc_items.append((ix, item))
+        if item.text is not None and item.text != "":
+            content_text += item.text + " "
+    if len(doc_items) > 0:
+        yield _process_page()

{docling-1.8.4.dist-info → docling-1.9.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 1.8.4
+Version: 1.9.0
 Summary: Docling PDF conversion package
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -20,13 +20,14 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Dist: certifi (>=2024.7.4)
-Requires-Dist: deepsearch-glm (>=0.19.0,<1)
-Requires-Dist: docling-core (>=1.1.2,<2.0.0)
+Requires-Dist: deepsearch-glm (>=0.19.1,<0.20.0)
+Requires-Dist: docling-core (>=1.1.3,<2.0.0)
 Requires-Dist: docling-ibm-models (>=1.1.3,<2.0.0)
-Requires-Dist: docling-parse (>=1.1.1,<2.0.0)
+Requires-Dist: docling-parse (>=1.1.3,<2.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)
+Requires-Dist: pyarrow (>=17.0.0,<18.0.0)
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
 Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
 Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
@@ -62,6 +63,8 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
 * 📝 Extracts metadata from the document, such as title, authors, references and language
 * 🔍 Optionally applies OCR (use with scanned PDFs)
+Doing RAG or Q/A? Also consider [Quackling](https://github.com/DS4SD/quackling) to get the most out of your documents.
 ## Installation
 To use Docling, simply install `docling` from your package manager, e.g. pip:

{docling-1.8.4.dist-info → docling-1.9.0.dist-info}/RECORD RENAMED Viewed

@@ -4,24 +4,25 @@ docling/backend/abstract_backend.py,sha256=xfNNiZKksPPa9KAiA-fHD86flg0It4n_29ccp
 docling/backend/docling_parse_backend.py,sha256=r3aJwsWR7qG47ElhOa9iQJJQauHMt950FfCsf6fhlP4,7480
 docling/backend/pypdfium2_backend.py,sha256=FggVFitmyMMmLar6vk6XQsavGOPQx95TD14opWYRMAY,8837
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/datamodel/base_models.py,sha256=F3iF7cQRvdO5RaPnrXballaTvnWkPTXnX-n9N4cpCGo,8842
+docling/datamodel/base_models.py,sha256=PSJe_Qlh2VJfijg3kkXOOqZbi_uqRHCmLjX__c5Buck,9155
 docling/datamodel/document.py,sha256=cG9RuAkFXCCGZqCHmhUtYeOA5PV6gjO3Y4i5lf2IM6I,13649
 docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
 docling/document_converter.py,sha256=5OiNafoaVcQhZ8ATF69xRp2KyFyKeSMhmwEFUoCzP-k,10980
 docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/models/base_ocr_model.py,sha256=Ipl82a3AV2OsgMQSMEMpnWJ6MXcmyIQzmp52PmTaB0g,4465
-docling/models/ds_glm_model.py,sha256=BszxBcUZPUFgDqngGLbS5pSRyOCkPxRrCi4zP7Vm8DY,3191
+docling/models/ds_glm_model.py,sha256=inNsmlriiDuqe3Q4LWL2DbqPTScP-3-dFgFoaJprFtQ,3367
 docling/models/easyocr_model.py,sha256=ABIqALvtNNrDQ47fXaZ0lDFhOwKsYGUUlAPnIsFZgZA,2232
 docling/models/layout_model.py,sha256=ZFmaLXlRWUfsT1pJCiYVxhQFrBBsiz6Aw0m9GM3UvVM,11249
 docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
-docling/models/table_structure_model.py,sha256=5jzTlpM-GdCSq4l0vD1W6aSPTJXeTcXEnNuPxnw-DlA,5437
+docling/models/table_structure_model.py,sha256=0wOeiRoma6et7FtoJZw2SA3wBd9-R9ivp5uvXBQqeM4,5768
 docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/pipeline/base_model_pipeline.py,sha256=AC5NTR0xLy5JIZqsTINkKEHeCPqpyvJpuE_bcnZhyvI,529
 docling/pipeline/standard_model_pipeline.py,sha256=UTjyaEXvz9htYZz-IMTkn11cZwNjgvo_Fl2dfBVnRQs,1442
 docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/utils/export.py,sha256=gP8609DtHp6bNGPhYpwe0g3J4qvc2HqQpHZnfl7hQZQ,5899
 docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
 docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
-docling-1.8.4.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
-docling-1.8.4.dist-info/METADATA,sha256=BfG2nwCktriHJ6k_NMw07Q0OmfNBOaY1V2_bFLd_AZA,7883
-docling-1.8.4.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling-1.8.4.dist-info/RECORD,,
+docling-1.9.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
+docling-1.9.0.dist-info/METADATA,sha256=YV5QVsWcEyeDIYezvMWyFg7csluluDQ2xT7LLT1J6Qg,8051
+docling-1.9.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling-1.9.0.dist-info/RECORD,,

{docling-1.8.4.dist-info → docling-1.9.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{docling-1.8.4.dist-info → docling-1.9.0.dist-info}/WHEEL RENAMED Viewed

File without changes

docling 1.8.4__py3-none-any.whl → 1.9.0__py3-none-any.whl

docling 1.8.4__py3-none-any.whl → 1.9.0__py3-none-any.whl