docling 2.7.0__py3-none-any.whl → 2.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,7 +14,8 @@ from docling_core.types.doc import (
14
14
  TableData,
15
15
  )
16
16
  from lxml import etree
17
- from PIL import Image
17
+ from lxml.etree import XPath
18
+ from PIL import Image, UnidentifiedImageError
18
19
 
19
20
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
20
21
  from docling.datamodel.base_models import InputFormat
@@ -132,8 +133,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
132
133
  def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
133
134
  for element in body:
134
135
  tag_name = etree.QName(element).localname
136
+
135
137
  # Check for Inline Images (blip elements)
136
- drawing_blip = element.xpath(".//a:blip")
138
+ namespaces = {
139
+ "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
140
+ "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
141
+ }
142
+ xpath_expr = XPath(".//a:blip", namespaces=namespaces)
143
+ drawing_blip = xpath_expr(element)
137
144
 
138
145
  # Check for Tables
139
146
  if element.tag.endswith("tbl"):
@@ -210,7 +217,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
210
217
  paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
211
218
 
212
219
  if paragraph.text is None:
213
- # _log.warn(f"paragraph has text==None")
214
220
  return
215
221
  text = paragraph.text.strip()
216
222
  # if len(text)==0 # keep empty paragraphs, they separate adjacent lists!
@@ -502,10 +508,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
502
508
  image_data = get_docx_image(element, drawing_blip)
503
509
  image_bytes = BytesIO(image_data)
504
510
  # Open the BytesIO object with PIL to create an Image
505
- pil_image = Image.open(image_bytes)
506
- doc.add_picture(
507
- parent=self.parents[self.level],
508
- image=ImageRef.from_pil(image=pil_image, dpi=72),
509
- caption=None,
510
- )
511
+ try:
512
+ pil_image = Image.open(image_bytes)
513
+ doc.add_picture(
514
+ parent=self.parents[self.level],
515
+ image=ImageRef.from_pil(image=pil_image, dpi=72),
516
+ caption=None,
517
+ )
518
+ except (UnidentifiedImageError, OSError) as e:
519
+ _log.warning("Warning: image cannot be loaded by Pillow")
520
+ doc.add_picture(
521
+ parent=self.parents[self.level],
522
+ caption=None,
523
+ )
511
524
  return
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.7.0
3
+ Version: 2.7.1
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -31,12 +31,13 @@ Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
31
31
  Requires-Dist: easyocr (>=1.7,<2.0)
32
32
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
33
33
  Requires-Dist: huggingface_hub (>=0.23,<1)
34
+ Requires-Dist: lxml (>=4.0.0,<6.0.0)
34
35
  Requires-Dist: marko (>=2.1.2,<3.0.0)
35
36
  Requires-Dist: ocrmac (>=1.0.0,<2.0.0) ; (sys_platform == "darwin") and (extra == "ocrmac")
36
37
  Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
37
38
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
38
39
  Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
39
- Requires-Dist: pydantic (>=2.0.0,<3.0.0)
40
+ Requires-Dist: pydantic (>=2.0.0,<2.10)
40
41
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
41
42
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
42
43
  Requires-Dist: python-docx (>=1.1.2,<2.0.0)
@@ -71,12 +72,13 @@ Description-Content-Type: text/markdown
71
72
  [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
72
73
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
73
74
  [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
75
+ [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
74
76
 
75
77
  Docling parses documents and exports them to the desired format with ease and speed.
76
78
 
77
79
  ## Features
78
80
 
79
- * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc, Markdown) and exports to Markdown and JSON
81
+ * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
80
82
  * 📑 Advanced PDF document understanding including page layout, reading order & table structures
81
83
  * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
82
84
  * 🤖 Easy integration with LlamaIndex 🦙 & LangChain 🦜🔗 for powerful RAG / QA applications
@@ -8,7 +8,7 @@ docling/backend/html_backend.py,sha256=qbu1W8xoTGnXMuZPRPLq68hDbCEj6ygnpxP5gYaod
8
8
  docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
9
9
  docling/backend/msexcel_backend.py,sha256=23qUEScqr5GhY06xiqg-eBQ_JlAqO0FkPEmX6554sVA,12040
10
10
  docling/backend/mspowerpoint_backend.py,sha256=QD0NaatTO8U9CIFoiipkq3X5HxLZaaahH8nlrQ6ecDA,15710
11
- docling/backend/msword_backend.py,sha256=-cCEh4EhdGknHrxiVGFE4GDo_iYpAqP2QxRaeqrJHUE,17939
11
+ docling/backend/msword_backend.py,sha256=sMumfB9Xa2Md1a8WO-fGPPAKf1s3mCvErMyZ-xnBC2E,18495
12
12
  docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
13
13
  docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
14
14
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -40,8 +40,8 @@ docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
40
40
  docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
41
41
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
42
42
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
43
- docling-2.7.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
44
- docling-2.7.0.dist-info/METADATA,sha256=6cpEQMbjK1tKCQ3kkzeOD7URm41HPx2xUSs-gxvlsM4,6761
45
- docling-2.7.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
46
- docling-2.7.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
47
- docling-2.7.0.dist-info/RECORD,,
43
+ docling-2.7.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
44
+ docling-2.7.1.dist-info/METADATA,sha256=TvD3BGlbO1ci54NzwmLxqSITXIdMefyj71YjdZkD7Vs,6906
45
+ docling-2.7.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
46
+ docling-2.7.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
47
+ docling-2.7.1.dist-info/RECORD,,