docling 2.7.0__tar.gz → 2.7.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {docling-2.7.0 → docling-2.7.1}/PKG-INFO +5 -3
  2. {docling-2.7.0 → docling-2.7.1}/README.md +2 -1
  3. {docling-2.7.0 → docling-2.7.1}/docling/backend/msword_backend.py +22 -9
  4. {docling-2.7.0 → docling-2.7.1}/pyproject.toml +3 -2
  5. {docling-2.7.0 → docling-2.7.1}/LICENSE +0 -0
  6. {docling-2.7.0 → docling-2.7.1}/docling/__init__.py +0 -0
  7. {docling-2.7.0 → docling-2.7.1}/docling/backend/__init__.py +0 -0
  8. {docling-2.7.0 → docling-2.7.1}/docling/backend/abstract_backend.py +0 -0
  9. {docling-2.7.0 → docling-2.7.1}/docling/backend/asciidoc_backend.py +0 -0
  10. {docling-2.7.0 → docling-2.7.1}/docling/backend/docling_parse_backend.py +0 -0
  11. {docling-2.7.0 → docling-2.7.1}/docling/backend/docling_parse_v2_backend.py +0 -0
  12. {docling-2.7.0 → docling-2.7.1}/docling/backend/html_backend.py +0 -0
  13. {docling-2.7.0 → docling-2.7.1}/docling/backend/md_backend.py +0 -0
  14. {docling-2.7.0 → docling-2.7.1}/docling/backend/msexcel_backend.py +0 -0
  15. {docling-2.7.0 → docling-2.7.1}/docling/backend/mspowerpoint_backend.py +0 -0
  16. {docling-2.7.0 → docling-2.7.1}/docling/backend/pdf_backend.py +0 -0
  17. {docling-2.7.0 → docling-2.7.1}/docling/backend/pypdfium2_backend.py +0 -0
  18. {docling-2.7.0 → docling-2.7.1}/docling/cli/__init__.py +0 -0
  19. {docling-2.7.0 → docling-2.7.1}/docling/cli/main.py +0 -0
  20. {docling-2.7.0 → docling-2.7.1}/docling/datamodel/__init__.py +0 -0
  21. {docling-2.7.0 → docling-2.7.1}/docling/datamodel/base_models.py +0 -0
  22. {docling-2.7.0 → docling-2.7.1}/docling/datamodel/document.py +0 -0
  23. {docling-2.7.0 → docling-2.7.1}/docling/datamodel/pipeline_options.py +0 -0
  24. {docling-2.7.0 → docling-2.7.1}/docling/datamodel/settings.py +0 -0
  25. {docling-2.7.0 → docling-2.7.1}/docling/document_converter.py +0 -0
  26. {docling-2.7.0 → docling-2.7.1}/docling/models/__init__.py +0 -0
  27. {docling-2.7.0 → docling-2.7.1}/docling/models/base_model.py +0 -0
  28. {docling-2.7.0 → docling-2.7.1}/docling/models/base_ocr_model.py +0 -0
  29. {docling-2.7.0 → docling-2.7.1}/docling/models/ds_glm_model.py +0 -0
  30. {docling-2.7.0 → docling-2.7.1}/docling/models/easyocr_model.py +0 -0
  31. {docling-2.7.0 → docling-2.7.1}/docling/models/layout_model.py +0 -0
  32. {docling-2.7.0 → docling-2.7.1}/docling/models/ocr_mac_model.py +0 -0
  33. {docling-2.7.0 → docling-2.7.1}/docling/models/page_assemble_model.py +0 -0
  34. {docling-2.7.0 → docling-2.7.1}/docling/models/page_preprocessing_model.py +0 -0
  35. {docling-2.7.0 → docling-2.7.1}/docling/models/table_structure_model.py +0 -0
  36. {docling-2.7.0 → docling-2.7.1}/docling/models/tesseract_ocr_cli_model.py +0 -0
  37. {docling-2.7.0 → docling-2.7.1}/docling/models/tesseract_ocr_model.py +0 -0
  38. {docling-2.7.0 → docling-2.7.1}/docling/pipeline/__init__.py +0 -0
  39. {docling-2.7.0 → docling-2.7.1}/docling/pipeline/base_pipeline.py +0 -0
  40. {docling-2.7.0 → docling-2.7.1}/docling/pipeline/simple_pipeline.py +0 -0
  41. {docling-2.7.0 → docling-2.7.1}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  42. {docling-2.7.0 → docling-2.7.1}/docling/utils/__init__.py +0 -0
  43. {docling-2.7.0 → docling-2.7.1}/docling/utils/export.py +0 -0
  44. {docling-2.7.0 → docling-2.7.1}/docling/utils/layout_utils.py +0 -0
  45. {docling-2.7.0 → docling-2.7.1}/docling/utils/profiling.py +0 -0
  46. {docling-2.7.0 → docling-2.7.1}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.7.0
3
+ Version: 2.7.1
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -31,12 +31,13 @@ Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
31
31
  Requires-Dist: easyocr (>=1.7,<2.0)
32
32
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
33
33
  Requires-Dist: huggingface_hub (>=0.23,<1)
34
+ Requires-Dist: lxml (>=4.0.0,<6.0.0)
34
35
  Requires-Dist: marko (>=2.1.2,<3.0.0)
35
36
  Requires-Dist: ocrmac (>=1.0.0,<2.0.0) ; (sys_platform == "darwin") and (extra == "ocrmac")
36
37
  Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
37
38
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
38
39
  Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
39
- Requires-Dist: pydantic (>=2.0.0,<3.0.0)
40
+ Requires-Dist: pydantic (>=2.0.0,<2.10)
40
41
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
41
42
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
42
43
  Requires-Dist: python-docx (>=1.1.2,<2.0.0)
@@ -71,12 +72,13 @@ Description-Content-Type: text/markdown
71
72
  [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
72
73
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
73
74
  [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
75
+ [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
74
76
 
75
77
  Docling parses documents and exports them to the desired format with ease and speed.
76
78
 
77
79
  ## Features
78
80
 
79
- * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc, Markdown) and exports to Markdown and JSON
81
+ * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
80
82
  * 📑 Advanced PDF document understanding including page layout, reading order & table structures
81
83
  * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
82
84
  * 🤖 Easy integration with LlamaIndex 🦙 & LangChain 🦜🔗 for powerful RAG / QA applications
@@ -20,12 +20,13 @@
20
20
  [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
21
21
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
22
22
  [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
23
+ [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
23
24
 
24
25
  Docling parses documents and exports them to the desired format with ease and speed.
25
26
 
26
27
  ## Features
27
28
 
28
- * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc, Markdown) and exports to Markdown and JSON
29
+ * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
29
30
  * 📑 Advanced PDF document understanding including page layout, reading order & table structures
30
31
  * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
31
32
  * 🤖 Easy integration with LlamaIndex 🦙 & LangChain 🦜🔗 for powerful RAG / QA applications
@@ -14,7 +14,8 @@ from docling_core.types.doc import (
14
14
  TableData,
15
15
  )
16
16
  from lxml import etree
17
- from PIL import Image
17
+ from lxml.etree import XPath
18
+ from PIL import Image, UnidentifiedImageError
18
19
 
19
20
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
20
21
  from docling.datamodel.base_models import InputFormat
@@ -132,8 +133,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
132
133
  def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
133
134
  for element in body:
134
135
  tag_name = etree.QName(element).localname
136
+
135
137
  # Check for Inline Images (blip elements)
136
- drawing_blip = element.xpath(".//a:blip")
138
+ namespaces = {
139
+ "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
140
+ "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
141
+ }
142
+ xpath_expr = XPath(".//a:blip", namespaces=namespaces)
143
+ drawing_blip = xpath_expr(element)
137
144
 
138
145
  # Check for Tables
139
146
  if element.tag.endswith("tbl"):
@@ -210,7 +217,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
210
217
  paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
211
218
 
212
219
  if paragraph.text is None:
213
- # _log.warn(f"paragraph has text==None")
214
220
  return
215
221
  text = paragraph.text.strip()
216
222
  # if len(text)==0 # keep empty paragraphs, they seperate adjacent lists!
@@ -502,10 +508,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
502
508
  image_data = get_docx_image(element, drawing_blip)
503
509
  image_bytes = BytesIO(image_data)
504
510
  # Open the BytesIO object with PIL to create an Image
505
- pil_image = Image.open(image_bytes)
506
- doc.add_picture(
507
- parent=self.parents[self.level],
508
- image=ImageRef.from_pil(image=pil_image, dpi=72),
509
- caption=None,
510
- )
511
+ try:
512
+ pil_image = Image.open(image_bytes)
513
+ doc.add_picture(
514
+ parent=self.parents[self.level],
515
+ image=ImageRef.from_pil(image=pil_image, dpi=72),
516
+ caption=None,
517
+ )
518
+ except (UnidentifiedImageError, OSError) as e:
519
+ _log.warning("Warning: image cannot be loaded by Pillow")
520
+ doc.add_picture(
521
+ parent=self.parents[self.level],
522
+ caption=None,
523
+ )
511
524
  return
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "2.7.0" # DO NOT EDIT, updated automatically
3
+ version = "2.7.1" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
@@ -25,7 +25,7 @@ packages = [{include = "docling"}]
25
25
  # actual dependencies:
26
26
  ######################
27
27
  python = "^3.9"
28
- pydantic = "^2.0.0"
28
+ pydantic = ">=2.0.0,<2.10"
29
29
  docling-core = "^2.4.0"
30
30
  docling-ibm-models = "^2.0.6"
31
31
  deepsearch-glm = "^0.26.1"
@@ -48,6 +48,7 @@ beautifulsoup4 = "^4.12.3"
48
48
  pandas = "^2.1.4"
49
49
  marko = "^2.1.2"
50
50
  openpyxl = "^3.1.5"
51
+ lxml = ">=4.0.0,<6.0.0"
51
52
  ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
52
53
 
53
54
  [tool.poetry.group.dev.dependencies]
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes