PyPI - docling - Versions diffs - 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl - Mend

docling 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)

docling/backend/abstract_backend.py +1 -0
docling/backend/asciidoc_backend.py +435 -0
docling/backend/docling_parse_backend.py +3 -3
docling/backend/docling_parse_v2_backend.py +11 -3
docling/backend/html_backend.py +8 -1
docling/backend/md_backend.py +293 -0
docling/backend/mspowerpoint_backend.py +62 -39
docling/backend/msword_backend.py +3 -10
docling/datamodel/base_models.py +15 -9
docling/datamodel/document.py +49 -12
docling/datamodel/pipeline_options.py +3 -0
docling/document_converter.py +18 -0
docling/models/base_ocr_model.py +9 -1
docling/models/ds_glm_model.py +16 -7
docling/models/easyocr_model.py +42 -40
docling/models/layout_model.py +63 -59
docling/models/page_assemble_model.py +105 -97
docling/models/page_preprocessing_model.py +7 -3
docling/models/table_structure_model.py +94 -85
docling/models/tesseract_ocr_cli_model.py +56 -52
docling/models/tesseract_ocr_model.py +50 -45
docling/pipeline/standard_pdf_pipeline.py +7 -7
{docling-2.0.0.dist-info → docling-2.2.0.dist-info}/METADATA +10 -9
docling-2.2.0.dist-info/RECORD +44 -0
docling-2.0.0.dist-info/RECORD +0 -42
{docling-2.0.0.dist-info → docling-2.2.0.dist-info}/LICENSE +0 -0
{docling-2.0.0.dist-info → docling-2.2.0.dist-info}/WHEEL +0 -0
{docling-2.0.0.dist-info → docling-2.2.0.dist-info}/entry_points.txt +0 -0

docling/models/table_structure_model.py CHANGED Viewed

@@ -71,92 +71,101 @@ class TableStructureModel(BasePageModel):
         for page in page_batch:
             assert page._backend is not None
-            assert page.predictions.layout is not None
-            assert page.size is not None
-            page.predictions.tablestructure = TableStructurePrediction()  # dummy
-            in_tables = [
-                (
-                    cluster,
-                    [
-                        round(cluster.bbox.l) * self.scale,
-                        round(cluster.bbox.t) * self.scale,
-                        round(cluster.bbox.r) * self.scale,
-                        round(cluster.bbox.b) * self.scale,
-                    ],
-                )
-                for cluster in page.predictions.layout.clusters
-                if cluster.label == DocItemLabel.TABLE
-            ]
-            if not len(in_tables):
+            if not page._backend.is_valid():
                 yield page
-                continue
-            tokens = []
-            for c in page.cells:
-                for cluster, _ in in_tables:
-                    if c.bbox.area() > 0:
-                        if (
-                            c.bbox.intersection_area_with(cluster.bbox) / c.bbox.area()
-                            > 0.2
-                        ):
-                            # Only allow non empty stings (spaces) into the cells of a table
-                            if len(c.text.strip()) > 0:
-                                new_cell = copy.deepcopy(c)
-                                new_cell.bbox = new_cell.bbox.scaled(scale=self.scale)
-                                tokens.append(new_cell.model_dump())
-            page_input = {
-                "tokens": tokens,
-                "width": page.size.width * self.scale,
-                "height": page.size.height * self.scale,
-            }
-            page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
-            table_clusters, table_bboxes = zip(*in_tables)
-            if len(table_bboxes):
-                tf_output = self.tf_predictor.multi_table_predict(
-                    page_input, table_bboxes, do_matching=self.do_cell_matching
-                )
-                for table_cluster, table_out in zip(table_clusters, tf_output):
-                    table_cells = []
-                    for element in table_out["tf_responses"]:
-                        if not self.do_cell_matching:
-                            the_bbox = BoundingBox.model_validate(
-                                element["bbox"]
-                            ).scaled(1 / self.scale)
-                            text_piece = page._backend.get_text_in_rect(the_bbox)
-                            element["bbox"]["token"] = text_piece
-                        tc = TableCell.model_validate(element)
-                        if self.do_cell_matching and tc.bbox is not None:
-                            tc.bbox = tc.bbox.scaled(1 / self.scale)
-                        table_cells.append(tc)
-                    # Retrieving cols/rows, after post processing:
-                    num_rows = table_out["predict_details"]["num_rows"]
-                    num_cols = table_out["predict_details"]["num_cols"]
-                    otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
-                    tbl = Table(
-                        otsl_seq=otsl_seq,
-                        table_cells=table_cells,
-                        num_rows=num_rows,
-                        num_cols=num_cols,
-                        id=table_cluster.id,
-                        page_no=page.page_no,
-                        cluster=table_cluster,
-                        label=DocItemLabel.TABLE,
+            else:
+                assert page.predictions.layout is not None
+                assert page.size is not None
+                page.predictions.tablestructure = TableStructurePrediction()  # dummy
+                in_tables = [
+                    (
+                        cluster,
+                        [
+                            round(cluster.bbox.l) * self.scale,
+                            round(cluster.bbox.t) * self.scale,
+                            round(cluster.bbox.r) * self.scale,
+                            round(cluster.bbox.b) * self.scale,
+                        ],
+                    )
+                    for cluster in page.predictions.layout.clusters
+                    if cluster.label == DocItemLabel.TABLE
+                ]
+                if not len(in_tables):
+                    yield page
+                    continue
+                tokens = []
+                for c in page.cells:
+                    for cluster, _ in in_tables:
+                        if c.bbox.area() > 0:
+                            if (
+                                c.bbox.intersection_area_with(cluster.bbox)
+                                / c.bbox.area()
+                                > 0.2
+                            ):
+                                # Only allow non empty stings (spaces) into the cells of a table
+                                if len(c.text.strip()) > 0:
+                                    new_cell = copy.deepcopy(c)
+                                    new_cell.bbox = new_cell.bbox.scaled(
+                                        scale=self.scale
+                                    )
+                                    tokens.append(new_cell.model_dump())
+                page_input = {
+                    "tokens": tokens,
+                    "width": page.size.width * self.scale,
+                    "height": page.size.height * self.scale,
+                }
+                page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
+                table_clusters, table_bboxes = zip(*in_tables)
+                if len(table_bboxes):
+                    tf_output = self.tf_predictor.multi_table_predict(
+                        page_input, table_bboxes, do_matching=self.do_cell_matching
                     )
-                    page.predictions.tablestructure.table_map[table_cluster.id] = tbl
-                # For debugging purposes:
-                # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
+                    for table_cluster, table_out in zip(table_clusters, tf_output):
+                        table_cells = []
+                        for element in table_out["tf_responses"]:
+                            if not self.do_cell_matching:
+                                the_bbox = BoundingBox.model_validate(
+                                    element["bbox"]
+                                ).scaled(1 / self.scale)
+                                text_piece = page._backend.get_text_in_rect(the_bbox)
+                                element["bbox"]["token"] = text_piece
+                            tc = TableCell.model_validate(element)
+                            if self.do_cell_matching and tc.bbox is not None:
+                                tc.bbox = tc.bbox.scaled(1 / self.scale)
+                            table_cells.append(tc)
+                        # Retrieving cols/rows, after post processing:
+                        num_rows = table_out["predict_details"]["num_rows"]
+                        num_cols = table_out["predict_details"]["num_cols"]
+                        otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
+                        tbl = Table(
+                            otsl_seq=otsl_seq,
+                            table_cells=table_cells,
+                            num_rows=num_rows,
+                            num_cols=num_cols,
+                            id=table_cluster.id,
+                            page_no=page.page_no,
+                            cluster=table_cluster,
+                            label=DocItemLabel.TABLE,
+                        )
+                        page.predictions.tablestructure.table_map[table_cluster.id] = (
+                            tbl
+                        )
+                    # For debugging purposes:
+                    # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
-            yield page
+                yield page

docling/models/tesseract_ocr_cli_model.py CHANGED Viewed

@@ -110,61 +110,65 @@ class TesseractOcrCliModel(BaseOcrModel):
         for page in page_batch:
             assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                ocr_rects = self.get_ocr_rects(page)
+                all_ocr_cells = []
+                for ocr_rect in ocr_rects:
+                    # Skip zero area boxes
+                    if ocr_rect.area() == 0:
+                        continue
+                    high_res_image = page._backend.get_page_image(
+                        scale=self.scale, cropbox=ocr_rect
+                    )
-            ocr_rects = self.get_ocr_rects(page)
-            all_ocr_cells = []
-            for ocr_rect in ocr_rects:
-                # Skip zero area boxes
-                if ocr_rect.area() == 0:
-                    continue
-                high_res_image = page._backend.get_page_image(
-                    scale=self.scale, cropbox=ocr_rect
-                )
-                with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as image_file:
-                    fname = image_file.name
-                    high_res_image.save(fname)
-                    df = self._run_tesseract(fname)
-                # _log.info(df)
-                # Print relevant columns (bounding box and text)
-                for ix, row in df.iterrows():
-                    text = row["text"]
-                    conf = row["conf"]
-                    l = float(row["left"])
-                    b = float(row["top"])
-                    w = float(row["width"])
-                    h = float(row["height"])
-                    t = b + h
-                    r = l + w
-                    cell = OcrCell(
-                        id=ix,
-                        text=text,
-                        confidence=conf / 100.0,
-                        bbox=BoundingBox.from_tuple(
-                            coord=(
-                                (l / self.scale) + ocr_rect.l,
-                                (b / self.scale) + ocr_rect.t,
-                                (r / self.scale) + ocr_rect.l,
-                                (t / self.scale) + ocr_rect.t,
+                    with tempfile.NamedTemporaryFile(
+                        suffix=".png", mode="w"
+                    ) as image_file:
+                        fname = image_file.name
+                        high_res_image.save(fname)
+                        df = self._run_tesseract(fname)
+                    # _log.info(df)
+                    # Print relevant columns (bounding box and text)
+                    for ix, row in df.iterrows():
+                        text = row["text"]
+                        conf = row["conf"]
+                        l = float(row["left"])
+                        b = float(row["top"])
+                        w = float(row["width"])
+                        h = float(row["height"])
+                        t = b + h
+                        r = l + w
+                        cell = OcrCell(
+                            id=ix,
+                            text=text,
+                            confidence=conf / 100.0,
+                            bbox=BoundingBox.from_tuple(
+                                coord=(
+                                    (l / self.scale) + ocr_rect.l,
+                                    (b / self.scale) + ocr_rect.t,
+                                    (r / self.scale) + ocr_rect.l,
+                                    (t / self.scale) + ocr_rect.t,
+                                ),
+                                origin=CoordOrigin.TOPLEFT,
                             ),
-                            origin=CoordOrigin.TOPLEFT,
-                        ),
-                    )
-                    all_ocr_cells.append(cell)
+                        )
+                        all_ocr_cells.append(cell)
-            ## Remove OCR cells which overlap with programmatic cells.
-            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+                ## Remove OCR cells which overlap with programmatic cells.
+                filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
-            page.cells.extend(filtered_ocr_cells)
+                page.cells.extend(filtered_ocr_cells)
-            # DEBUG code:
-            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+                # DEBUG code:
+                # self.draw_ocr_rects_and_cells(page, ocr_rects)
-            yield page
+                yield page

docling/models/tesseract_ocr_model.py CHANGED Viewed

@@ -69,57 +69,62 @@ class TesseractOcrModel(BaseOcrModel):
         for page in page_batch:
             assert page._backend is not None
-            assert self.reader is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                assert self.reader is not None
-            ocr_rects = self.get_ocr_rects(page)
+                ocr_rects = self.get_ocr_rects(page)
-            all_ocr_cells = []
-            for ocr_rect in ocr_rects:
-                # Skip zero area boxes
-                if ocr_rect.area() == 0:
-                    continue
-                high_res_image = page._backend.get_page_image(
-                    scale=self.scale, cropbox=ocr_rect
-                )
+                all_ocr_cells = []
+                for ocr_rect in ocr_rects:
+                    # Skip zero area boxes
+                    if ocr_rect.area() == 0:
+                        continue
+                    high_res_image = page._backend.get_page_image(
+                        scale=self.scale, cropbox=ocr_rect
+                    )
-                # Retrieve text snippets with their bounding boxes
-                self.reader.SetImage(high_res_image)
-                boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)
-                cells = []
-                for ix, (im, box, _, _) in enumerate(boxes):
-                    # Set the area of interest. Tesseract uses Bottom-Left for the origin
-                    self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
-                    # Extract text within the bounding box
-                    text = self.reader.GetUTF8Text().strip()
-                    confidence = self.reader.MeanTextConf()
-                    left = box["x"] / self.scale
-                    bottom = box["y"] / self.scale
-                    right = (box["x"] + box["w"]) / self.scale
-                    top = (box["y"] + box["h"]) / self.scale
-                    cells.append(
-                        OcrCell(
-                            id=ix,
-                            text=text,
-                            confidence=confidence,
-                            bbox=BoundingBox.from_tuple(
-                                coord=(left, top, right, bottom),
-                                origin=CoordOrigin.TOPLEFT,
-                            ),
-                        )
+                    # Retrieve text snippets with their bounding boxes
+                    self.reader.SetImage(high_res_image)
+                    boxes = self.reader.GetComponentImages(
+                        self.reader_RIL.TEXTLINE, True
                     )
-                # del high_res_image
-                all_ocr_cells.extend(cells)
+                    cells = []
+                    for ix, (im, box, _, _) in enumerate(boxes):
+                        # Set the area of interest. Tesseract uses Bottom-Left for the origin
+                        self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
+                        # Extract text within the bounding box
+                        text = self.reader.GetUTF8Text().strip()
+                        confidence = self.reader.MeanTextConf()
+                        left = box["x"] / self.scale
+                        bottom = box["y"] / self.scale
+                        right = (box["x"] + box["w"]) / self.scale
+                        top = (box["y"] + box["h"]) / self.scale
+                        cells.append(
+                            OcrCell(
+                                id=ix,
+                                text=text,
+                                confidence=confidence,
+                                bbox=BoundingBox.from_tuple(
+                                    coord=(left, top, right, bottom),
+                                    origin=CoordOrigin.TOPLEFT,
+                                ),
+                            )
+                        )
+                    # del high_res_image
+                    all_ocr_cells.extend(cells)
-            ## Remove OCR cells which overlap with programmatic cells.
-            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+                ## Remove OCR cells which overlap with programmatic cells.
+                filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
-            page.cells.extend(filtered_ocr_cells)
+                page.cells.extend(filtered_ocr_cells)
-            # DEBUG code:
-            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+                # DEBUG code:
+                # self.draw_ocr_rects_and_cells(page, ocr_rects)
-            yield page
+                yield page

docling/pipeline/standard_pdf_pipeline.py CHANGED Viewed

@@ -134,13 +134,13 @@ class StandardPdfPipeline(PaginatedPipeline):
         all_body = []
         for p in conv_res.pages:
-            assert p.assembled is not None
-            for el in p.assembled.body:
-                all_body.append(el)
-            for el in p.assembled.headers:
-                all_headers.append(el)
-            for el in p.assembled.elements:
-                all_elements.append(el)
+            if p.assembled is not None:
+                for el in p.assembled.body:
+                    all_body.append(el)
+                for el in p.assembled.headers:
+                    all_headers.append(el)
+                for el in p.assembled.elements:
+                    all_elements.append(el)
         conv_res.assembled = AssembledUnit(
             elements=all_elements, headers=all_headers, body=all_body

{docling-2.0.0.dist-info → docling-2.2.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.0.0
+Version: 2.2.0
 Summary: Docling PDF conversion package
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -22,13 +22,14 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Provides-Extra: tesserocr
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
-Requires-Dist: deepsearch-glm (>=0.25.0,<0.26.0)
-Requires-Dist: docling-core (>=2.0.0,<3.0.0)
+Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
+Requires-Dist: docling-core (>=2.1.0,<3.0.0)
 Requires-Dist: docling-ibm-models (>=2.0.1,<3.0.0)
-Requires-Dist: docling-parse (>=1.6.0,<2.0.0)
+Requires-Dist: docling-parse (>=2.0.0,<3.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)
+Requires-Dist: marko (>=2.1.2,<3.0.0)
 Requires-Dist: pandas (>=2.1.4,<3.0.0)
 Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
@@ -50,7 +51,7 @@ Description-Content-Type: text/markdown
 <p align="center">
   <a href="https://github.com/ds4sd/docling">
-    <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/logo.png" width="150" />
+    <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/docling_processing.png" width="100%"/>
   </a>
 </p>
@@ -69,6 +70,7 @@ Description-Content-Type: text/markdown
 Docling parses documents and exports them to the desired format with ease and speed.
 ## Features
 * 🗂️ Multi-format support for input (PDF, DOCX etc.) & output (Markdown, JSON etc.)
@@ -94,16 +96,15 @@ More [detailed installation instructions](https://ds4sd.github.io/docling/instal
 ## Getting started
-To convert invidual documents, use `convert()`, for example:
+To convert individual documents, use `convert()`, for example:
 ```python
 from docling.document_converter import DocumentConverter
-source = "https://arxiv.org/pdf/2408.09869"  # PDF path or URL
+source = "https://arxiv.org/pdf/2408.09869"  # document per local path or URL
 converter = DocumentConverter()
 result = converter.convert(source)
 print(result.document.export_to_markdown())  # output: "## Docling Technical Report[...]"
-print(result.document.export_to_document_tokens())  # output: "<document><title><page_1><loc_20>..."
 ```
@@ -144,6 +145,6 @@ If you use Docling in your projects, please consider citing the following:
 ## License
-The Docling codebase is under MIT license.
+The Docling codebase is under MIT license.
 For individual model usage, please refer to the model licenses found in the original packages.

docling-2.2.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,44 @@
+docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq_G2RdU,1678
+docling/backend/asciidoc_backend.py,sha256=WW0eIanPIObcg5ci9YcnqFxwipmqRFsRY8zjZDdKvJA,14116
+docling/backend/docling_parse_backend.py,sha256=TaIMli9vePd3fz9L6S4t75JPYZDpgYBLRGfWjbc9Hbk,7632
+docling/backend/docling_parse_v2_backend.py,sha256=QlVU8NgqKvVCa99E8oDa2Xvy__kq30C-myGY3o9Qoq4,8588
+docling/backend/html_backend.py,sha256=wfh5PWEwoqsCXxFCQbFBdJvEtlqZhXgqfPfTYETWHfE,14974
+docling/backend/md_backend.py,sha256=osYiNLnep9UgLq8mUH9bmwG3kP9RXxt69I8LlyeJN6g,11505
+docling/backend/mspowerpoint_backend.py,sha256=J472AIH_IXvGg3D0FDmXhue1At_VSBD6n15c64Kxttw,15446
+docling/backend/msword_backend.py,sha256=6bY0ebOaeSbpskUJY5t5pOf4a2VclWzeHeSo-vzsaO0,17470
+docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
+docling/backend/pypdfium2_backend.py,sha256=MJX6fQqwK3r967fyAAs-RA_YIkeQvhgsLkQAgaBTgaE,8995
+docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/cli/main.py,sha256=NRVGz0z-3EBwYNMJGVnLtDBcfOeutaUyYdkM0ymRnGA,8008
+docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/datamodel/base_models.py,sha256=Mx0xR6YmRP8thu8CjOxjbGHLUJctqIvFwRZQ-8tQowY,5380
+docling/datamodel/document.py,sha256=mkPXDms9jtPFY1pfBSicNaVRZwbbfzYFUj0dJDbMgG8,20612
+docling/datamodel/pipeline_options.py,sha256=WNjluKC-Ww63ifkGMHwws8zIDHnOS1z5Hw7_j3S0qao,2446
+docling/datamodel/settings.py,sha256=KBFVeQviR1hoCFjA1ZwuLuQ6EAAYR7saIa6EUYiOkHI,767
+docling/document_converter.py,sha256=T-Y2pWwbCIofW209XJ3wlc5TiGeQqMbDqgzcVWyZ_0Y,10227
+docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/models/base_model.py,sha256=wSBGAIAbLqrqP_SMtkzXMuyFvvzjVU6iCqgSNnGIR4Y,603
+docling/models/base_ocr_model.py,sha256=SYelQRValiUo6M_p_9-J7CqNIOFO-EkK58j90SMsKQY,5028
+docling/models/ds_glm_model.py,sha256=vJLngchZonqFzGWbUr2izFSXk9DloPDhAfN2c3nkzNU,11254
+docling/models/easyocr_model.py,sha256=YfvdodjZ20WuOfouQXJmDyQL78QDOqWYsWSs2zSxWFc,3327
+docling/models/layout_model.py,sha256=zd2ULW3U6v9OJl4TnjWFEY6Q2O-lBfrIqtvrnDzF7HU,12596
+docling/models/page_assemble_model.py,sha256=LOKHho-r-RpeIVh8CpJ9tid_QIp5um3ukcrucZsyUlY,6645
+docling/models/page_preprocessing_model.py,sha256=cfhUIlGAGaX1RxILi69ZEV9Kmhhd3Y0XaSlQnGo18o4,1964
+docling/models/table_structure_model.py,sha256=YWSZKOz56gvicjTzVgSE-8Z_hI3NcRD5EN0yOUoM-_g,6979
+docling/models/tesseract_ocr_cli_model.py,sha256=fKc05V73ibMvAeuA4PForhYNtunpT5rR0k_xHZsew-E,5980
+docling/models/tesseract_ocr_model.py,sha256=v6td0vq8NogePuRTJRZhKF0DtZXITj70r9rKJKO5u9k,4984
+docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/pipeline/base_pipeline.py,sha256=7DTzVvM_jVHCxyY-BuuGRhmUsD_sgX4DD00oBFJWdB8,6723
+docling/pipeline/simple_pipeline.py,sha256=pxce0-3He5Lqa-xXT-7h173XVOSMZiMHl6HOfAJmQ7o,2162
+docling/pipeline/standard_pdf_pipeline.py,sha256=AVNSxGc6kPmBPDLWDc9eI8fryc25eOtiIVrOyVhZMZM,7527
+docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
+docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
+docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
+docling-2.2.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.2.0.dist-info/METADATA,sha256=TkaywA2l2ImdMc9WpUYWUQy3n50zG9Y9eC7ziElBlU0,6205
+docling-2.2.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling-2.2.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
+docling-2.2.0.dist-info/RECORD,,

docling-2.0.0.dist-info/RECORD DELETED Viewed

@@ -1,42 +0,0 @@
-docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/backend/abstract_backend.py,sha256=8Lh1gf1P9AnzlwB989OVBgLmokTpfI0LxYRfuvYTqoo,1646
-docling/backend/docling_parse_backend.py,sha256=UgBpopZIP5YkhwhybiqDnqVsSqv9DAAPFkafhfL0pPo,7623
-docling/backend/docling_parse_v2_backend.py,sha256=VY7MsiyqjN3Vl0UkyezriiVJMLbLRrQVuKjWaTgIUwY,8336
-docling/backend/html_backend.py,sha256=MlhEXaA0tgX_tLuQLnkex43gsKqpqHWnbkssxY4n_kc,14753
-docling/backend/mspowerpoint_backend.py,sha256=2UYfMMeWwgDtvIKQELCA-bYv5Z-rGvbMiBNcidNL_uE,14332
-docling/backend/msword_backend.py,sha256=4SDqZAZxLr6VV50OU3MRBAV8SwZMCyJCUbNVMVUpitc,17659
-docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
-docling/backend/pypdfium2_backend.py,sha256=MJX6fQqwK3r967fyAAs-RA_YIkeQvhgsLkQAgaBTgaE,8995
-docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/cli/main.py,sha256=NRVGz0z-3EBwYNMJGVnLtDBcfOeutaUyYdkM0ymRnGA,8008
-docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/datamodel/base_models.py,sha256=Ha-DoRZoksjHSZHWqUSiQ79MTBEfY5ur8U_LVtyBRYU,5153
-docling/datamodel/document.py,sha256=GCARkUuv8TNtFO934E7KujOsTkBFqLXX5bogNprVXEM,19411
-docling/datamodel/pipeline_options.py,sha256=mez7CiJMtm-xhOmZ-2-M_Q3YwC6EzHytWfg0E3tiVio,2329
-docling/datamodel/settings.py,sha256=KBFVeQviR1hoCFjA1ZwuLuQ6EAAYR7saIa6EUYiOkHI,767
-docling/document_converter.py,sha256=S_t9hs2uZfXC38LC0hTaAihrSJIrCvnTiuY5SvUccgk,9587
-docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/models/base_model.py,sha256=wSBGAIAbLqrqP_SMtkzXMuyFvvzjVU6iCqgSNnGIR4Y,603
-docling/models/base_ocr_model.py,sha256=N5pOQ4RQSWPU-bPZ81FySDdBnwNG64-6K0ldK6ENU0U,4672
-docling/models/ds_glm_model.py,sha256=nUBHTsE-eRtrtPE6v_N4iZGr43bXIsOfb_8NFUMWJQk,11057
-docling/models/easyocr_model.py,sha256=URhHzxwnBuErf6sskWyEWauX-Kne0upnrAguzKQi3SI,3090
-docling/models/layout_model.py,sha256=B4Veff9V0WxcQXTBYzJM6rE7B3lszUI7zmg7EFE0WxU,12245
-docling/models/page_assemble_model.py,sha256=ovwSki52w1rlrc7MgMbjh1Uc5H8XBCz9S2nHE44mzYU,6030
-docling/models/page_preprocessing_model.py,sha256=PJ_jASz3w0Lus_Ep4NN5Vq_Redq7x8vAyVR8qXCb6Eg,1817
-docling/models/table_structure_model.py,sha256=qcjXXiNZcMWjr6ys02sToKZlAr8S0rAJNICbBjK9Ijo,6426
-docling/models/tesseract_ocr_cli_model.py,sha256=l-gRDU273opgack9fAxHaXPEdX5IdD5ZTnu6VsfKIWc,5665
-docling/models/tesseract_ocr_model.py,sha256=tEEq-URSYnyQru7RoD5fx-s1trwMxPCcwJx94M4iuxc,4676
-docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/pipeline/base_pipeline.py,sha256=7DTzVvM_jVHCxyY-BuuGRhmUsD_sgX4DD00oBFJWdB8,6723
-docling/pipeline/simple_pipeline.py,sha256=pxce0-3He5Lqa-xXT-7h173XVOSMZiMHl6HOfAJmQ7o,2162
-docling/pipeline/standard_pdf_pipeline.py,sha256=_gRGR9tsy55_tptFj-AiEJEedxhJ0iIjHb5qaj36d28,7506
-docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
-docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
-docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
-docling-2.0.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
-docling-2.0.0.dist-info/METADATA,sha256=RyawmIT2dz9la0DH8KsW749TNq4BpiSIndVEz83wauQ,6235
-docling-2.0.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling-2.0.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
-docling-2.0.0.dist-info/RECORD,,

{docling-2.0.0.dist-info → docling-2.2.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{docling-2.0.0.dist-info → docling-2.2.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{docling-2.0.0.dist-info → docling-2.2.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

docling 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl

docling 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl