PyPI - docling - Versions diffs - 2.14.0__tar.gz → 2.15.0__tar.gz - Mend

docling 2.14.0__tar.gz → 2.15.0__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (55)

{docling-2.14.0 → docling-2.15.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.14.0
+Version: 2.15.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -26,7 +26,7 @@ Provides-Extra: tesserocr
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
-Requires-Dist: docling-core[chunking] (>=2.12.1,<3.0.0)
+Requires-Dist: docling-core[chunking] (>=2.13.1,<3.0.0)
 Requires-Dist: docling-ibm-models (>=3.1.0,<4.0.0)
 Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)

{docling-2.14.0 → docling-2.15.0}/docling/backend/html_backend.py RENAMED Viewed

@@ -37,10 +37,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         try:
             if isinstance(self.path_or_stream, BytesIO):
-                text_stream = self.path_or_stream.getvalue().decode("utf-8")
+                text_stream = self.path_or_stream.getvalue()
                 self.soup = BeautifulSoup(text_stream, "html.parser")
             if isinstance(self.path_or_stream, Path):
-                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                with open(self.path_or_stream, "rb") as f:
                     html_content = f.read()
                     self.soup = BeautifulSoup(html_content, "html.parser")
         except Exception as e:

{docling-2.14.0 → docling-2.15.0}/docling/backend/mspowerpoint_backend.py RENAMED Viewed

@@ -16,7 +16,7 @@ from docling_core.types.doc import (
     TableCell,
     TableData,
 )
-from PIL import Image
+from PIL import Image, UnidentifiedImageError
 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
@@ -120,6 +120,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         bullet_type = "None"
         list_text = ""
         list_label = GroupLabel.LIST
+        doc_label = DocItemLabel.LIST_ITEM
         prov = self.generate_prov(shape, slide_ind, shape.text.strip())
         # Identify if shape contains lists
@@ -276,16 +277,19 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         im_dpi, _ = image.dpi
         # Open it with PIL
-        pil_image = Image.open(BytesIO(image_bytes))
-        # shape has picture
-        prov = self.generate_prov(shape, slide_ind, "")
-        doc.add_picture(
-            parent=parent_slide,
-            image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
-            caption=None,
-            prov=prov,
-        )
+        try:
+            pil_image = Image.open(BytesIO(image_bytes))
+            # shape has picture
+            prov = self.generate_prov(shape, slide_ind, "")
+            doc.add_picture(
+                parent=parent_slide,
+                image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
+                caption=None,
+                prov=prov,
+            )
+        except (UnidentifiedImageError, OSError) as e:
+            _log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
         return
     def handle_tables(self, shape, parent_slide, slide_ind, doc):

{docling-2.14.0 → docling-2.15.0}/docling/cli/main.py RENAMED Viewed

@@ -164,6 +164,11 @@ def convert(
     to_formats: List[OutputFormat] = typer.Option(
         None, "--to", help="Specify output formats. Defaults to Markdown."
     ),
+    headers: str = typer.Option(
+        None,
+        "--headers",
+        help="Specify http request headers used when fetching url input sources in the form of a JSON string",
+    ),
     image_export_mode: Annotated[
         ImageRefMode,
         typer.Option(
@@ -279,12 +284,19 @@ def convert(
     if from_formats is None:
         from_formats = [e for e in InputFormat]
+    parsed_headers: Optional[Dict[str, str]] = None
+    if headers is not None:
+        headers_t = TypeAdapter(Dict[str, str])
+        parsed_headers = headers_t.validate_json(headers)
     with tempfile.TemporaryDirectory() as tempdir:
         input_doc_paths: List[Path] = []
         for src in input_sources:
             try:
                 # check if we can fetch some remote url
-                source = resolve_source_to_path(source=src, workdir=Path(tempdir))
+                source = resolve_source_to_path(
+                    source=src, headers=parsed_headers, workdir=Path(tempdir)
+                )
                 input_doc_paths.append(source)
             except FileNotFoundError:
                 err_console.print(
@@ -390,7 +402,7 @@ def convert(
         start_time = time.time()
         conv_results = doc_converter.convert_all(
-            input_doc_paths, raises_on_error=abort_on_error
+            input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
         )
         output.mkdir(parents=True, exist_ok=True)

{docling-2.14.0 → docling-2.15.0}/docling/datamodel/document.py RENAMED Viewed

@@ -227,13 +227,18 @@ class _DummyBackend(AbstractDocumentBackend):
 class _DocumentConversionInput(BaseModel):
     path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
+    headers: Optional[Dict[str, str]] = None
     limits: Optional[DocumentLimits] = DocumentLimits()
     def docs(
         self, format_options: Dict[InputFormat, "FormatOption"]
     ) -> Iterable[InputDocument]:
         for item in self.path_or_stream_iterator:
-            obj = resolve_source_to_stream(item) if isinstance(item, str) else item
+            obj = (
+                resolve_source_to_stream(item, self.headers)
+                if isinstance(item, str)
+                else item
+            )
             format = self._guess_format(obj)
             backend: Type[AbstractDocumentBackend]
             if format not in format_options.keys():

{docling-2.14.0 → docling-2.15.0}/docling/document_converter.py RENAMED Viewed

@@ -176,6 +176,7 @@ class DocumentConverter:
     def convert(
         self,
         source: Union[Path, str, DocumentStream],  # TODO review naming
+        headers: Optional[Dict[str, str]] = None,
         raises_on_error: bool = True,
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,
@@ -185,6 +186,7 @@ class DocumentConverter:
             raises_on_error=raises_on_error,
             max_num_pages=max_num_pages,
             max_file_size=max_file_size,
+            headers=headers,
         )
         return next(all_res)
@@ -192,6 +194,7 @@ class DocumentConverter:
     def convert_all(
         self,
         source: Iterable[Union[Path, str, DocumentStream]],  # TODO review naming
+        headers: Optional[Dict[str, str]] = None,
         raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,
@@ -201,8 +204,7 @@ class DocumentConverter:
             max_file_size=max_file_size,
         )
         conv_input = _DocumentConversionInput(
-            path_or_stream_iterator=source,
-            limits=limits,
+            path_or_stream_iterator=source, limits=limits, headers=headers
         )
         conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)

{docling-2.14.0 → docling-2.15.0}/docling/models/base_ocr_model.py RENAMED Viewed

@@ -138,18 +138,31 @@ class BaseOcrModel(BasePageModel):
     def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
         image = copy.deepcopy(page.image)
+        scale_x = image.width / page.size.width
+        scale_y = image.height / page.size.height
         draw = ImageDraw.Draw(image, "RGBA")
         # Draw OCR rectangles as yellow filled rect
         for rect in ocr_rects:
             x0, y0, x1, y1 = rect.as_tuple()
+            y0 *= scale_x
+            y1 *= scale_y
+            x0 *= scale_x
+            x1 *= scale_x
             shade_color = (255, 255, 0, 40)  # transparent yellow
             draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)
         # Draw OCR and programmatic cells
         for tc in page.cells:
             x0, y0, x1, y1 = tc.bbox.as_tuple()
-            color = "red"
+            y0 *= scale_x
+            y1 *= scale_y
+            x0 *= scale_x
+            x1 *= scale_x
+            color = "gray"
             if isinstance(tc, OcrCell):
                 color = "magenta"
             draw.rectangle([(x0, y0), (x1, y1)], outline=color)

{docling-2.14.0 → docling-2.15.0}/docling/models/layout_model.py RENAMED Viewed

@@ -67,29 +67,9 @@ class LayoutModel(BasePageModel):
         - Right: Clusters including FORM, KEY_VALUE_REGION, and PICTURE.
         Includes label names and confidence scores for each cluster.
         """
-        label_to_color = {
-            DocItemLabel.TEXT: (255, 255, 153),  # Light Yellow
-            DocItemLabel.CAPTION: (255, 204, 153),  # Light Orange
-            DocItemLabel.LIST_ITEM: (153, 153, 255),  # Light Purple
-            DocItemLabel.FORMULA: (192, 192, 192),  # Gray
-            DocItemLabel.TABLE: (255, 204, 204),  # Light Pink
-            DocItemLabel.PICTURE: (255, 204, 164),  # Light Beige
-            DocItemLabel.SECTION_HEADER: (255, 153, 153),  # Light Red
-            DocItemLabel.PAGE_HEADER: (204, 255, 204),  # Light Green
-            DocItemLabel.PAGE_FOOTER: (
-                204,
-                255,
-                204,
-            ),  # Light Green (same as Page-Header)
-            DocItemLabel.TITLE: (255, 153, 153),  # Light Red (same as Section-Header)
-            DocItemLabel.FOOTNOTE: (200, 200, 255),  # Light Blue
-            DocItemLabel.DOCUMENT_INDEX: (220, 220, 220),  # Light Gray
-            DocItemLabel.CODE: (125, 125, 125),  # Gray
-            DocItemLabel.CHECKBOX_SELECTED: (255, 182, 193),  # Pale Green
-            DocItemLabel.CHECKBOX_UNSELECTED: (255, 182, 193),  # Light Pink
-            DocItemLabel.FORM: (200, 255, 255),  # Light Cyan
-            DocItemLabel.KEY_VALUE_REGION: (183, 65, 14),  # Rusty orange
-        }
+        scale_x = page.image.width / page.size.width
+        scale_y = page.image.height / page.size.height
         # Filter clusters for left and right images
         exclude_labels = {
             DocItemLabel.FORM,
@@ -118,6 +98,11 @@ class LayoutModel(BasePageModel):
                     cell_color = (0, 0, 0, 40)  # Transparent black for cells
                     for tc in c.cells:
                         cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
+                        cx0 *= scale_x
+                        cx1 *= scale_x
+                        cy0 *= scale_x
+                        cy1 *= scale_y
                         draw.rectangle(
                             [(cx0, cy0), (cx1, cy1)],
                             outline=None,
@@ -125,8 +110,16 @@ class LayoutModel(BasePageModel):
                         )
                     # Draw cluster rectangle
                     x0, y0, x1, y1 = c.bbox.as_tuple()
-                    cluster_fill_color = (*list(label_to_color.get(c.label)), 70)
-                    cluster_outline_color = (*list(label_to_color.get(c.label)), 255)
+                    x0 *= scale_x
+                    x1 *= scale_x
+                    y0 *= scale_x
+                    y1 *= scale_y
+                    cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
+                    cluster_outline_color = (
+                        *list(DocItemLabel.get_color(c.label)),
+                        255,
+                    )
                     draw.rectangle(
                         [(x0, y0), (x1, y1)],
                         outline=cluster_outline_color,

{docling-2.14.0 → docling-2.15.0}/docling/models/table_structure_model.py RENAMED Viewed

@@ -66,23 +66,43 @@ class TableStructureModel(BasePageModel):
         show: bool = False,
     ):
         assert page._backend is not None
+        assert page.size is not None
         image = (
             page._backend.get_page_image()
         )  # make new image to avoid drawing on the saved ones
+        scale_x = image.width / page.size.width
+        scale_y = image.height / page.size.height
         draw = ImageDraw.Draw(image)
         for table_element in tbl_list:
             x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple()
+            y0 *= scale_x
+            y1 *= scale_y
+            x0 *= scale_x
+            x1 *= scale_x
             draw.rectangle([(x0, y0), (x1, y1)], outline="red")
             for cell in table_element.cluster.cells:
                 x0, y0, x1, y1 = cell.bbox.as_tuple()
+                x0 *= scale_x
+                x1 *= scale_x
+                y0 *= scale_x
+                y1 *= scale_y
                 draw.rectangle([(x0, y0), (x1, y1)], outline="green")
             for tc in table_element.table_cells:
                 if tc.bbox is not None:
                     x0, y0, x1, y1 = tc.bbox.as_tuple()
+                    x0 *= scale_x
+                    x1 *= scale_x
+                    y0 *= scale_x
+                    y1 *= scale_y
                     if tc.column_header:
                         width = 3
                     else:

{docling-2.14.0 → docling-2.15.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.14.0"  # DO NOT EDIT, updated automatically
+version = "2.15.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"
@@ -25,7 +25,7 @@ packages = [{include = "docling"}]
 # actual dependencies:
 ######################
 python = "^3.9"
-docling-core = { version = "^2.12.1", extras = ["chunking"] }
+docling-core = { version = "^2.13.1", extras = ["chunking"] }
 pydantic = "^2.0.0"
 docling-ibm-models = "^3.1.0"
 deepsearch-glm = "^1.0.0"

{docling-2.14.0 → docling-2.15.0}/LICENSE RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/README.md RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/__init__.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/backend/__init__.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/backend/abstract_backend.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/backend/asciidoc_backend.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/backend/docling_parse_backend.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/backend/docling_parse_v2_backend.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/backend/md_backend.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/backend/msexcel_backend.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/backend/msword_backend.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/backend/pdf_backend.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/backend/pypdfium2_backend.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/backend/xml/__init__.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/backend/xml/pubmed_backend.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/backend/xml/uspto_backend.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/chunking/__init__.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/cli/__init__.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/datamodel/__init__.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/datamodel/base_models.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/datamodel/pipeline_options.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/datamodel/settings.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/exceptions.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/models/__init__.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/models/base_model.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/models/ds_glm_model.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/models/easyocr_model.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/models/ocr_mac_model.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/models/page_assemble_model.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/models/page_preprocessing_model.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/models/rapid_ocr_model.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/models/tesseract_ocr_cli_model.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/models/tesseract_ocr_model.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/pipeline/__init__.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/pipeline/base_pipeline.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/pipeline/simple_pipeline.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/pipeline/standard_pdf_pipeline.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/py.typed RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/utils/__init__.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/utils/accelerator_utils.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/utils/export.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/utils/glm_utils.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/utils/layout_postprocessor.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/utils/profiling.py RENAMED Viewed

File without changes

{docling-2.14.0 → docling-2.15.0}/docling/utils/utils.py RENAMED Viewed

File without changes

docling 2.14.0__tar.gz → 2.15.0__tar.gz

docling 2.14.0__tar.gz → 2.15.0__tar.gz