docling 2.7.0__py3-none-any.whl → 2.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/msword_backend.py +22 -9
- {docling-2.7.0.dist-info → docling-2.7.1.dist-info}/METADATA +5 -3
- {docling-2.7.0.dist-info → docling-2.7.1.dist-info}/RECORD +6 -6
- {docling-2.7.0.dist-info → docling-2.7.1.dist-info}/LICENSE +0 -0
- {docling-2.7.0.dist-info → docling-2.7.1.dist-info}/WHEEL +0 -0
- {docling-2.7.0.dist-info → docling-2.7.1.dist-info}/entry_points.txt +0 -0
@@ -14,7 +14,8 @@ from docling_core.types.doc import (
|
|
14
14
|
TableData,
|
15
15
|
)
|
16
16
|
from lxml import etree
|
17
|
-
from PIL import Image
|
17
|
+
from lxml.etree import XPath
|
18
|
+
from PIL import Image, UnidentifiedImageError
|
18
19
|
|
19
20
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
20
21
|
from docling.datamodel.base_models import InputFormat
|
@@ -132,8 +133,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
132
133
|
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
133
134
|
for element in body:
|
134
135
|
tag_name = etree.QName(element).localname
|
136
|
+
|
135
137
|
# Check for Inline Images (blip elements)
|
136
|
-
|
138
|
+
namespaces = {
|
139
|
+
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
140
|
+
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
|
141
|
+
}
|
142
|
+
xpath_expr = XPath(".//a:blip", namespaces=namespaces)
|
143
|
+
drawing_blip = xpath_expr(element)
|
137
144
|
|
138
145
|
# Check for Tables
|
139
146
|
if element.tag.endswith("tbl"):
|
@@ -210,7 +217,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
210
217
|
paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
|
211
218
|
|
212
219
|
if paragraph.text is None:
|
213
|
-
# _log.warn(f"paragraph has text==None")
|
214
220
|
return
|
215
221
|
text = paragraph.text.strip()
|
216
222
|
# if len(text)==0 # keep empty paragraphs, they seperate adjacent lists!
|
@@ -502,10 +508,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
502
508
|
image_data = get_docx_image(element, drawing_blip)
|
503
509
|
image_bytes = BytesIO(image_data)
|
504
510
|
# Open the BytesIO object with PIL to create an Image
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
+
try:
|
512
|
+
pil_image = Image.open(image_bytes)
|
513
|
+
doc.add_picture(
|
514
|
+
parent=self.parents[self.level],
|
515
|
+
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
516
|
+
caption=None,
|
517
|
+
)
|
518
|
+
except (UnidentifiedImageError, OSError) as e:
|
519
|
+
_log.warning("Warning: image cannot be loaded by Pillow")
|
520
|
+
doc.add_picture(
|
521
|
+
parent=self.parents[self.level],
|
522
|
+
caption=None,
|
523
|
+
)
|
511
524
|
return
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.7.0
|
3
|
+
Version: 2.7.1
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -31,12 +31,13 @@ Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
|
|
31
31
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
32
32
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
33
33
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
34
|
+
Requires-Dist: lxml (>=4.0.0,<6.0.0)
|
34
35
|
Requires-Dist: marko (>=2.1.2,<3.0.0)
|
35
36
|
Requires-Dist: ocrmac (>=1.0.0,<2.0.0) ; (sys_platform == "darwin") and (extra == "ocrmac")
|
36
37
|
Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
|
37
38
|
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
38
39
|
Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
|
39
|
-
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
40
|
+
Requires-Dist: pydantic (>=2.0.0,<2.10)
|
40
41
|
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
41
42
|
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
42
43
|
Requires-Dist: python-docx (>=1.1.2,<2.0.0)
|
@@ -71,12 +72,13 @@ Description-Content-Type: text/markdown
|
|
71
72
|
[](https://pydantic.dev)
|
72
73
|
[](https://github.com/pre-commit/pre-commit)
|
73
74
|
[](https://opensource.org/licenses/MIT)
|
75
|
+
[](https://pepy.tech/projects/docling)
|
74
76
|
|
75
77
|
Docling parses documents and exports them to the desired format with ease and speed.
|
76
78
|
|
77
79
|
## Features
|
78
80
|
|
79
|
-
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc
|
81
|
+
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
|
80
82
|
* 📑 Advanced PDF document understanding including page layout, reading order & table structures
|
81
83
|
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
|
82
84
|
* 🤖 Easy integration with LlamaIndex 🦙 & LangChain 🦜🔗 for powerful RAG / QA applications
|
@@ -8,7 +8,7 @@ docling/backend/html_backend.py,sha256=qbu1W8xoTGnXMuZPRPLq68hDbCEj6ygnpxP5gYaod
|
|
8
8
|
docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
|
9
9
|
docling/backend/msexcel_backend.py,sha256=23qUEScqr5GhY06xiqg-eBQ_JlAqO0FkPEmX6554sVA,12040
|
10
10
|
docling/backend/mspowerpoint_backend.py,sha256=QD0NaatTO8U9CIFoiipkq3X5HxLZaaahH8nlrQ6ecDA,15710
|
11
|
-
docling/backend/msword_backend.py,sha256
|
11
|
+
docling/backend/msword_backend.py,sha256=sMumfB9Xa2Md1a8WO-fGPPAKf1s3mCvErMyZ-xnBC2E,18495
|
12
12
|
docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
|
13
13
|
docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
|
14
14
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -40,8 +40,8 @@ docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
|
40
40
|
docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
|
41
41
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
42
42
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
43
|
-
docling-2.7.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
44
|
-
docling-2.7.
|
45
|
-
docling-2.7.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
46
|
-
docling-2.7.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
47
|
-
docling-2.7.0.dist-info/RECORD,,
|
43
|
+
docling-2.7.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
44
|
+
docling-2.7.1.dist-info/METADATA,sha256=TvD3BGlbO1ci54NzwmLxqSITXIdMefyj71YjdZkD7Vs,6906
|
45
|
+
docling-2.7.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
46
|
+
docling-2.7.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
47
|
+
docling-2.7.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|