docling 2.57.0__tar.gz → 2.58.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (157) hide show
  1. {docling-2.57.0 → docling-2.58.0}/PKG-INFO +4 -2
  2. {docling-2.57.0 → docling-2.58.0}/README.md +1 -0
  3. {docling-2.57.0 → docling-2.58.0}/docling/backend/abstract_backend.py +24 -3
  4. {docling-2.57.0 → docling-2.58.0}/docling/backend/asciidoc_backend.py +3 -3
  5. {docling-2.57.0 → docling-2.58.0}/docling/backend/docling_parse_v4_backend.py +15 -4
  6. {docling-2.57.0 → docling-2.58.0}/docling/backend/html_backend.py +130 -20
  7. {docling-2.57.0 → docling-2.58.0}/docling/backend/md_backend.py +27 -5
  8. {docling-2.57.0 → docling-2.58.0}/docling/backend/msexcel_backend.py +115 -27
  9. {docling-2.57.0 → docling-2.58.0}/docling/backend/mspowerpoint_backend.py +2 -2
  10. {docling-2.57.0 → docling-2.58.0}/docling/backend/msword_backend.py +18 -18
  11. {docling-2.57.0 → docling-2.58.0}/docling/backend/pdf_backend.py +9 -2
  12. {docling-2.57.0 → docling-2.58.0}/docling/backend/pypdfium2_backend.py +12 -3
  13. {docling-2.57.0 → docling-2.58.0}/docling/cli/main.py +85 -30
  14. docling-2.58.0/docling/datamodel/asr_model_specs.py +494 -0
  15. docling-2.58.0/docling/datamodel/backend_options.py +82 -0
  16. {docling-2.57.0 → docling-2.58.0}/docling/datamodel/base_models.py +17 -2
  17. {docling-2.57.0 → docling-2.58.0}/docling/datamodel/document.py +81 -48
  18. {docling-2.57.0 → docling-2.58.0}/docling/datamodel/pipeline_options_asr_model.py +21 -1
  19. {docling-2.57.0 → docling-2.58.0}/docling/document_converter.py +37 -45
  20. {docling-2.57.0 → docling-2.58.0}/docling/document_extractor.py +12 -11
  21. {docling-2.57.0 → docling-2.58.0}/docling/models/readingorder_model.py +6 -7
  22. {docling-2.57.0 → docling-2.58.0}/docling/pipeline/asr_pipeline.py +139 -3
  23. {docling-2.57.0 → docling-2.58.0}/docling/utils/api_image_request.py +4 -4
  24. {docling-2.57.0 → docling-2.58.0}/docling/utils/layout_postprocessor.py +23 -24
  25. {docling-2.57.0 → docling-2.58.0}/docling.egg-info/PKG-INFO +4 -2
  26. {docling-2.57.0 → docling-2.58.0}/docling.egg-info/SOURCES.txt +3 -0
  27. {docling-2.57.0 → docling-2.58.0}/docling.egg-info/requires.txt +4 -1
  28. {docling-2.57.0 → docling-2.58.0}/pyproject.toml +3 -2
  29. docling-2.58.0/tests/test_asr_mlx_whisper.py +340 -0
  30. docling-2.58.0/tests/test_asr_pipeline.py +398 -0
  31. docling-2.58.0/tests/test_backend_html.py +443 -0
  32. {docling-2.57.0 → docling-2.58.0}/tests/test_backend_markdown.py +1 -2
  33. {docling-2.57.0 → docling-2.58.0}/tests/test_backend_msexcel.py +65 -0
  34. docling-2.58.0/tests/test_cli.py +92 -0
  35. {docling-2.57.0 → docling-2.58.0}/tests/test_input_doc.py +42 -0
  36. docling-2.58.0/tests/test_interfaces.py +138 -0
  37. docling-2.58.0/tests/test_pdf_password.py +63 -0
  38. docling-2.57.0/docling/datamodel/asr_model_specs.py +0 -92
  39. docling-2.57.0/tests/test_asr_pipeline.py +0 -85
  40. docling-2.57.0/tests/test_backend_html.py +0 -213
  41. docling-2.57.0/tests/test_cli.py +0 -27
  42. docling-2.57.0/tests/test_interfaces.py +0 -63
  43. {docling-2.57.0 → docling-2.58.0}/LICENSE +0 -0
  44. {docling-2.57.0 → docling-2.58.0}/docling/__init__.py +0 -0
  45. {docling-2.57.0 → docling-2.58.0}/docling/backend/__init__.py +0 -0
  46. {docling-2.57.0 → docling-2.58.0}/docling/backend/csv_backend.py +0 -0
  47. {docling-2.57.0 → docling-2.58.0}/docling/backend/docling_parse_backend.py +0 -0
  48. {docling-2.57.0 → docling-2.58.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  49. {docling-2.57.0 → docling-2.58.0}/docling/backend/docx/__init__.py +0 -0
  50. {docling-2.57.0 → docling-2.58.0}/docling/backend/docx/drawingml/utils.py +0 -0
  51. {docling-2.57.0 → docling-2.58.0}/docling/backend/docx/latex/__init__.py +0 -0
  52. {docling-2.57.0 → docling-2.58.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  53. {docling-2.57.0 → docling-2.58.0}/docling/backend/docx/latex/omml.py +0 -0
  54. {docling-2.57.0 → docling-2.58.0}/docling/backend/json/__init__.py +0 -0
  55. {docling-2.57.0 → docling-2.58.0}/docling/backend/json/docling_json_backend.py +0 -0
  56. {docling-2.57.0 → docling-2.58.0}/docling/backend/mets_gbs_backend.py +0 -0
  57. {docling-2.57.0 → docling-2.58.0}/docling/backend/noop_backend.py +0 -0
  58. {docling-2.57.0 → docling-2.58.0}/docling/backend/webvtt_backend.py +0 -0
  59. {docling-2.57.0 → docling-2.58.0}/docling/backend/xml/__init__.py +0 -0
  60. {docling-2.57.0 → docling-2.58.0}/docling/backend/xml/jats_backend.py +0 -0
  61. {docling-2.57.0 → docling-2.58.0}/docling/backend/xml/uspto_backend.py +0 -0
  62. {docling-2.57.0 → docling-2.58.0}/docling/chunking/__init__.py +0 -0
  63. {docling-2.57.0 → docling-2.58.0}/docling/cli/__init__.py +0 -0
  64. {docling-2.57.0 → docling-2.58.0}/docling/cli/models.py +0 -0
  65. {docling-2.57.0 → docling-2.58.0}/docling/cli/tools.py +0 -0
  66. {docling-2.57.0 → docling-2.58.0}/docling/datamodel/__init__.py +0 -0
  67. {docling-2.57.0 → docling-2.58.0}/docling/datamodel/accelerator_options.py +0 -0
  68. {docling-2.57.0 → docling-2.58.0}/docling/datamodel/extraction.py +0 -0
  69. {docling-2.57.0 → docling-2.58.0}/docling/datamodel/layout_model_specs.py +0 -0
  70. {docling-2.57.0 → docling-2.58.0}/docling/datamodel/pipeline_options.py +0 -0
  71. {docling-2.57.0 → docling-2.58.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
  72. {docling-2.57.0 → docling-2.58.0}/docling/datamodel/settings.py +0 -0
  73. {docling-2.57.0 → docling-2.58.0}/docling/datamodel/vlm_model_specs.py +0 -0
  74. {docling-2.57.0 → docling-2.58.0}/docling/exceptions.py +0 -0
  75. {docling-2.57.0 → docling-2.58.0}/docling/models/__init__.py +0 -0
  76. {docling-2.57.0 → docling-2.58.0}/docling/models/api_vlm_model.py +0 -0
  77. {docling-2.57.0 → docling-2.58.0}/docling/models/auto_ocr_model.py +0 -0
  78. {docling-2.57.0 → docling-2.58.0}/docling/models/base_model.py +0 -0
  79. {docling-2.57.0 → docling-2.58.0}/docling/models/base_ocr_model.py +0 -0
  80. {docling-2.57.0 → docling-2.58.0}/docling/models/code_formula_model.py +0 -0
  81. {docling-2.57.0 → docling-2.58.0}/docling/models/document_picture_classifier.py +0 -0
  82. {docling-2.57.0 → docling-2.58.0}/docling/models/easyocr_model.py +0 -0
  83. {docling-2.57.0 → docling-2.58.0}/docling/models/factories/__init__.py +0 -0
  84. {docling-2.57.0 → docling-2.58.0}/docling/models/factories/base_factory.py +0 -0
  85. {docling-2.57.0 → docling-2.58.0}/docling/models/factories/ocr_factory.py +0 -0
  86. {docling-2.57.0 → docling-2.58.0}/docling/models/factories/picture_description_factory.py +0 -0
  87. {docling-2.57.0 → docling-2.58.0}/docling/models/layout_model.py +0 -0
  88. {docling-2.57.0 → docling-2.58.0}/docling/models/ocr_mac_model.py +0 -0
  89. {docling-2.57.0 → docling-2.58.0}/docling/models/page_assemble_model.py +0 -0
  90. {docling-2.57.0 → docling-2.58.0}/docling/models/page_preprocessing_model.py +0 -0
  91. {docling-2.57.0 → docling-2.58.0}/docling/models/picture_description_api_model.py +0 -0
  92. {docling-2.57.0 → docling-2.58.0}/docling/models/picture_description_base_model.py +0 -0
  93. {docling-2.57.0 → docling-2.58.0}/docling/models/picture_description_vlm_model.py +0 -0
  94. {docling-2.57.0 → docling-2.58.0}/docling/models/plugins/__init__.py +0 -0
  95. {docling-2.57.0 → docling-2.58.0}/docling/models/plugins/defaults.py +0 -0
  96. {docling-2.57.0 → docling-2.58.0}/docling/models/rapid_ocr_model.py +0 -0
  97. {docling-2.57.0 → docling-2.58.0}/docling/models/table_structure_model.py +0 -0
  98. {docling-2.57.0 → docling-2.58.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  99. {docling-2.57.0 → docling-2.58.0}/docling/models/tesseract_ocr_model.py +0 -0
  100. {docling-2.57.0 → docling-2.58.0}/docling/models/utils/__init__.py +0 -0
  101. {docling-2.57.0 → docling-2.58.0}/docling/models/utils/generation_utils.py +0 -0
  102. {docling-2.57.0 → docling-2.58.0}/docling/models/utils/hf_model_download.py +0 -0
  103. {docling-2.57.0 → docling-2.58.0}/docling/models/vlm_models_inline/__init__.py +0 -0
  104. {docling-2.57.0 → docling-2.58.0}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
  105. {docling-2.57.0 → docling-2.58.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
  106. {docling-2.57.0 → docling-2.58.0}/docling/models/vlm_models_inline/nuextract_transformers_model.py +0 -0
  107. {docling-2.57.0 → docling-2.58.0}/docling/models/vlm_models_inline/vllm_model.py +0 -0
  108. {docling-2.57.0 → docling-2.58.0}/docling/pipeline/__init__.py +0 -0
  109. {docling-2.57.0 → docling-2.58.0}/docling/pipeline/base_extraction_pipeline.py +0 -0
  110. {docling-2.57.0 → docling-2.58.0}/docling/pipeline/base_pipeline.py +0 -0
  111. {docling-2.57.0 → docling-2.58.0}/docling/pipeline/extraction_vlm_pipeline.py +0 -0
  112. {docling-2.57.0 → docling-2.58.0}/docling/pipeline/simple_pipeline.py +0 -0
  113. {docling-2.57.0 → docling-2.58.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  114. {docling-2.57.0 → docling-2.58.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +0 -0
  115. {docling-2.57.0 → docling-2.58.0}/docling/pipeline/vlm_pipeline.py +0 -0
  116. {docling-2.57.0 → docling-2.58.0}/docling/py.typed +0 -0
  117. {docling-2.57.0 → docling-2.58.0}/docling/utils/__init__.py +0 -0
  118. {docling-2.57.0 → docling-2.58.0}/docling/utils/accelerator_utils.py +0 -0
  119. {docling-2.57.0 → docling-2.58.0}/docling/utils/export.py +0 -0
  120. {docling-2.57.0 → docling-2.58.0}/docling/utils/glm_utils.py +0 -0
  121. {docling-2.57.0 → docling-2.58.0}/docling/utils/locks.py +0 -0
  122. {docling-2.57.0 → docling-2.58.0}/docling/utils/model_downloader.py +0 -0
  123. {docling-2.57.0 → docling-2.58.0}/docling/utils/ocr_utils.py +0 -0
  124. {docling-2.57.0 → docling-2.58.0}/docling/utils/orientation.py +0 -0
  125. {docling-2.57.0 → docling-2.58.0}/docling/utils/profiling.py +0 -0
  126. {docling-2.57.0 → docling-2.58.0}/docling/utils/utils.py +0 -0
  127. {docling-2.57.0 → docling-2.58.0}/docling/utils/visualization.py +0 -0
  128. {docling-2.57.0 → docling-2.58.0}/docling.egg-info/dependency_links.txt +0 -0
  129. {docling-2.57.0 → docling-2.58.0}/docling.egg-info/entry_points.txt +0 -0
  130. {docling-2.57.0 → docling-2.58.0}/docling.egg-info/top_level.txt +0 -0
  131. {docling-2.57.0 → docling-2.58.0}/setup.cfg +0 -0
  132. {docling-2.57.0 → docling-2.58.0}/tests/test_backend_asciidoc.py +0 -0
  133. {docling-2.57.0 → docling-2.58.0}/tests/test_backend_csv.py +0 -0
  134. {docling-2.57.0 → docling-2.58.0}/tests/test_backend_docling_json.py +0 -0
  135. {docling-2.57.0 → docling-2.58.0}/tests/test_backend_docling_parse.py +0 -0
  136. {docling-2.57.0 → docling-2.58.0}/tests/test_backend_docling_parse_v2.py +0 -0
  137. {docling-2.57.0 → docling-2.58.0}/tests/test_backend_docling_parse_v4.py +0 -0
  138. {docling-2.57.0 → docling-2.58.0}/tests/test_backend_jats.py +0 -0
  139. {docling-2.57.0 → docling-2.58.0}/tests/test_backend_mets_gbs.py +0 -0
  140. {docling-2.57.0 → docling-2.58.0}/tests/test_backend_msword.py +0 -0
  141. {docling-2.57.0 → docling-2.58.0}/tests/test_backend_patent_uspto.py +0 -0
  142. {docling-2.57.0 → docling-2.58.0}/tests/test_backend_pdfium.py +0 -0
  143. {docling-2.57.0 → docling-2.58.0}/tests/test_backend_pptx.py +0 -0
  144. {docling-2.57.0 → docling-2.58.0}/tests/test_backend_vtt.py +0 -0
  145. {docling-2.57.0 → docling-2.58.0}/tests/test_backend_webp.py +0 -0
  146. {docling-2.57.0 → docling-2.58.0}/tests/test_code_formula.py +0 -0
  147. {docling-2.57.0 → docling-2.58.0}/tests/test_data_gen_flag.py +0 -0
  148. {docling-2.57.0 → docling-2.58.0}/tests/test_document_picture_classifier.py +0 -0
  149. {docling-2.57.0 → docling-2.58.0}/tests/test_e2e_conversion.py +0 -0
  150. {docling-2.57.0 → docling-2.58.0}/tests/test_e2e_ocr_conversion.py +0 -0
  151. {docling-2.57.0 → docling-2.58.0}/tests/test_extraction.py +0 -0
  152. {docling-2.57.0 → docling-2.58.0}/tests/test_invalid_input.py +0 -0
  153. {docling-2.57.0 → docling-2.58.0}/tests/test_legacy_format_transform.py +0 -0
  154. {docling-2.57.0 → docling-2.58.0}/tests/test_ocr_utils.py +0 -0
  155. {docling-2.57.0 → docling-2.58.0}/tests/test_options.py +0 -0
  156. {docling-2.57.0 → docling-2.58.0}/tests/test_settings_load.py +0 -0
  157. {docling-2.57.0 → docling-2.58.0}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.57.0
3
+ Version: 2.58.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -27,7 +27,7 @@ Description-Content-Type: text/markdown
27
27
  License-File: LICENSE
28
28
  Requires-Dist: pydantic<3.0.0,>=2.0.0
29
29
  Requires-Dist: docling-core[chunking]<3.0.0,>=2.48.2
30
- Requires-Dist: docling-parse<5.0.0,>=4.4.0
30
+ Requires-Dist: docling-parse<5.0.0,>=4.7.0
31
31
  Requires-Dist: docling-ibm-models<4,>=3.9.1
32
32
  Requires-Dist: filetype<2.0.0,>=1.2.0
33
33
  Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
@@ -69,6 +69,7 @@ Provides-Extra: rapidocr
69
69
  Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14" and extra == "rapidocr"
70
70
  Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
71
71
  Provides-Extra: asr
72
+ Requires-Dist: mlx-whisper>=0.4.3; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "asr"
72
73
  Requires-Dist: openai-whisper>=20250625; extra == "asr"
73
74
  Dynamic: license-file
74
75
 
@@ -96,6 +97,7 @@ Dynamic: license-file
96
97
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
97
98
  [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
98
99
  [![Chat with Dosu](https://dosu.dev/dosu-chat-badge.svg)](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
100
+ [![Discord](https://img.shields.io/discord/1399788921306746971?color=6A7EC2&logo=discord&logoColor=ffffff)](https://docling.ai/discord)
99
101
  [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
100
102
  [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
101
103
 
@@ -22,6 +22,7 @@
22
22
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
23
23
  [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
24
24
  [![Chat with Dosu](https://dosu.dev/dosu-chat-badge.svg)](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
25
+ [![Discord](https://img.shields.io/discord/1399788921306746971?color=6A7EC2&logo=discord&logoColor=ffffff)](https://docling.ai/discord)
25
26
  [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
26
27
  [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
27
28
 
@@ -1,10 +1,16 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import TYPE_CHECKING, Set, Union
4
+ from typing import TYPE_CHECKING, Union
5
5
 
6
6
  from docling_core.types.doc import DoclingDocument
7
7
 
8
+ from docling.datamodel.backend_options import (
9
+ BackendOptions,
10
+ BaseBackendOptions,
11
+ DeclarativeBackendOptions,
12
+ )
13
+
8
14
  if TYPE_CHECKING:
9
15
  from docling.datamodel.base_models import InputFormat
10
16
  from docling.datamodel.document import InputDocument
@@ -12,11 +18,17 @@ if TYPE_CHECKING:
12
18
 
13
19
  class AbstractDocumentBackend(ABC):
14
20
  @abstractmethod
15
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
21
+ def __init__(
22
+ self,
23
+ in_doc: "InputDocument",
24
+ path_or_stream: Union[BytesIO, Path],
25
+ options: BaseBackendOptions = BaseBackendOptions(),
26
+ ):
16
27
  self.file = in_doc.file
17
28
  self.path_or_stream = path_or_stream
18
29
  self.document_hash = in_doc.document_hash
19
30
  self.input_format = in_doc.format
31
+ self.options = options
20
32
 
21
33
  @abstractmethod
22
34
  def is_valid(self) -> bool:
@@ -35,7 +47,7 @@ class AbstractDocumentBackend(ABC):
35
47
 
36
48
  @classmethod
37
49
  @abstractmethod
38
- def supported_formats(cls) -> Set["InputFormat"]:
50
+ def supported_formats(cls) -> set["InputFormat"]:
39
51
  pass
40
52
 
41
53
 
@@ -58,6 +70,15 @@ class DeclarativeDocumentBackend(AbstractDocumentBackend):
58
70
  straight without a recognition pipeline.
59
71
  """
60
72
 
73
+ @abstractmethod
74
+ def __init__(
75
+ self,
76
+ in_doc: "InputDocument",
77
+ path_or_stream: Union[BytesIO, Path],
78
+ options: BackendOptions = DeclarativeBackendOptions(),
79
+ ) -> None:
80
+ super().__init__(in_doc, path_or_stream, options)
81
+
61
82
  @abstractmethod
62
83
  def convert(self) -> DoclingDocument:
63
84
  pass
@@ -2,7 +2,7 @@ import logging
2
2
  import re
3
3
  from io import BytesIO
4
4
  from pathlib import Path
5
- from typing import Final, Set, Union
5
+ from typing import Final, Union
6
6
 
7
7
  from docling_core.types.doc import (
8
8
  DocItemLabel,
@@ -27,7 +27,7 @@ DEFAULT_IMAGE_HEIGHT: Final = 128
27
27
 
28
28
 
29
29
  class AsciiDocBackend(DeclarativeDocumentBackend):
30
- def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
30
+ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
31
31
  super().__init__(in_doc, path_or_stream)
32
32
 
33
33
  self.path_or_stream = path_or_stream
@@ -58,7 +58,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
58
58
  return
59
59
 
60
60
  @classmethod
61
- def supported_formats(cls) -> Set[InputFormat]:
61
+ def supported_formats(cls) -> set[InputFormat]:
62
62
  return {InputFormat.ASCIIDOC}
63
63
 
64
64
  def convert(self) -> DoclingDocument:
@@ -12,6 +12,7 @@ from PIL import Image
12
12
  from pypdfium2 import PdfPage
13
13
 
14
14
  from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
15
+ from docling.datamodel.backend_options import PdfBackendOptions
15
16
  from docling.datamodel.base_models import Size
16
17
  from docling.utils.locks import pypdfium2_lock
17
18
 
@@ -189,13 +190,23 @@ class DoclingParseV4PageBackend(PdfPageBackend):
189
190
 
190
191
 
191
192
  class DoclingParseV4DocumentBackend(PdfDocumentBackend):
192
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
193
- super().__init__(in_doc, path_or_stream)
193
+ def __init__(
194
+ self,
195
+ in_doc: "InputDocument",
196
+ path_or_stream: Union[BytesIO, Path],
197
+ options: PdfBackendOptions = PdfBackendOptions(),
198
+ ):
199
+ super().__init__(in_doc, path_or_stream, options)
194
200
 
201
+ password = (
202
+ self.options.password.get_secret_value() if self.options.password else None
203
+ )
195
204
  with pypdfium2_lock:
196
- self._pdoc = pdfium.PdfDocument(self.path_or_stream)
205
+ self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
197
206
  self.parser = DoclingPdfParser(loglevel="fatal")
198
- self.dp_doc: PdfDocument = self.parser.load(path_or_stream=self.path_or_stream)
207
+ self.dp_doc: PdfDocument = self.parser.load(
208
+ path_or_stream=self.path_or_stream, password=password
209
+ )
199
210
  success = self.dp_doc is not None
200
211
 
201
212
  if not success:
@@ -1,13 +1,16 @@
1
+ import base64
1
2
  import logging
3
+ import os
2
4
  import re
3
- import traceback
5
+ import warnings
4
6
  from contextlib import contextmanager
5
7
  from copy import deepcopy
6
8
  from io import BytesIO
7
9
  from pathlib import Path
8
10
  from typing import Final, Optional, Union, cast
9
- from urllib.parse import urljoin
11
+ from urllib.parse import urljoin, urlparse
10
12
 
13
+ import requests
11
14
  from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
12
15
  from bs4.element import PreformattedString
13
16
  from docling_core.types.doc import (
@@ -17,6 +20,7 @@ from docling_core.types.doc import (
17
20
  DocumentOrigin,
18
21
  GroupItem,
19
22
  GroupLabel,
23
+ PictureItem,
20
24
  RefItem,
21
25
  RichTableCell,
22
26
  TableCell,
@@ -24,13 +28,18 @@ from docling_core.types.doc import (
24
28
  TableItem,
25
29
  TextItem,
26
30
  )
27
- from docling_core.types.doc.document import ContentLayer, Formatting, Script
31
+ from docling_core.types.doc.document import ContentLayer, Formatting, ImageRef, Script
32
+ from PIL import Image, UnidentifiedImageError
28
33
  from pydantic import AnyUrl, BaseModel, ValidationError
29
34
  from typing_extensions import override
30
35
 
31
- from docling.backend.abstract_backend import DeclarativeDocumentBackend
36
+ from docling.backend.abstract_backend import (
37
+ DeclarativeDocumentBackend,
38
+ )
39
+ from docling.datamodel.backend_options import HTMLBackendOptions
32
40
  from docling.datamodel.base_models import InputFormat
33
41
  from docling.datamodel.document import InputDocument
42
+ from docling.exceptions import OperationNotAllowed
34
43
 
35
44
  _log = logging.getLogger(__name__)
36
45
 
@@ -43,6 +52,7 @@ _BLOCK_TAGS: Final = {
43
52
  "details",
44
53
  "figure",
45
54
  "footer",
55
+ "img",
46
56
  "h1",
47
57
  "h2",
48
58
  "h3",
@@ -186,11 +196,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
186
196
  self,
187
197
  in_doc: InputDocument,
188
198
  path_or_stream: Union[BytesIO, Path],
189
- original_url: Optional[AnyUrl] = None,
199
+ options: HTMLBackendOptions = HTMLBackendOptions(),
190
200
  ):
191
- super().__init__(in_doc, path_or_stream)
201
+ super().__init__(in_doc, path_or_stream, options)
192
202
  self.soup: Optional[Tag] = None
193
- self.path_or_stream = path_or_stream
203
+ self.path_or_stream: Union[BytesIO, Path] = path_or_stream
204
+ self.base_path: Optional[str] = str(options.source_uri)
194
205
 
195
206
  # Initialize the parents for the hierarchy
196
207
  self.max_levels = 10
@@ -200,7 +211,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
200
211
  for i in range(self.max_levels):
201
212
  self.parents[i] = None
202
213
  self.hyperlink: Union[AnyUrl, Path, None] = None
203
- self.original_url = original_url
204
214
  self.format_tags: list[str] = []
205
215
 
206
216
  try:
@@ -261,7 +271,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
261
271
  content_layer=ContentLayer.FURNITURE,
262
272
  )
263
273
  # remove script and style tags
264
- for tag in self.soup(["script", "style"]):
274
+ for tag in self.soup(["script", "noscript", "style"]):
265
275
  tag.decompose()
266
276
  # remove any hidden tag
267
277
  for tag in self.soup(hidden=True):
@@ -291,6 +301,28 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
291
301
  self._walk(content, doc)
292
302
  return doc
293
303
 
304
+ @staticmethod
305
+ def _is_remote_url(value: str) -> bool:
306
+ parsed = urlparse(value)
307
+ return parsed.scheme in {"http", "https", "ftp", "s3", "gs"}
308
+
309
+ def _resolve_relative_path(self, loc: str) -> str:
310
+ abs_loc = loc
311
+
312
+ if self.base_path:
313
+ if loc.startswith("//"):
314
+ # Protocol-relative URL - default to https
315
+ abs_loc = "https:" + loc
316
+ elif not loc.startswith(("http://", "https://", "data:", "file://")):
317
+ if HTMLDocumentBackend._is_remote_url(self.base_path): # remote fetch
318
+ abs_loc = urljoin(self.base_path, loc)
319
+ elif self.base_path: # local fetch
320
+ # For local files, resolve relative to the HTML file location
321
+ abs_loc = str(Path(self.base_path).parent / loc)
322
+
323
+ _log.debug(f"Resolved location {loc} to {abs_loc}")
324
+ return abs_loc
325
+
294
326
  @staticmethod
295
327
  def group_cell_elements(
296
328
  group_name: str,
@@ -520,7 +552,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
520
552
  if name == "img":
521
553
  flush_buffer()
522
554
  im_ref3 = self._emit_image(node, doc)
523
- added_refs.append(im_ref3)
555
+ if im_ref3:
556
+ added_refs.append(im_ref3)
524
557
  elif name in _FORMAT_TAG_MAP:
525
558
  with self._use_format([name]):
526
559
  wk = self._walk(node, doc)
@@ -669,8 +702,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
669
702
  else:
670
703
  if isinstance(this_href, str) and this_href:
671
704
  old_hyperlink = self.hyperlink
672
- if self.original_url is not None:
673
- this_href = urljoin(str(self.original_url), str(this_href))
705
+ this_href = self._resolve_relative_path(this_href)
674
706
  # ugly fix for relative links since pydantic does not support them.
675
707
  try:
676
708
  new_hyperlink = AnyUrl(this_href)
@@ -837,7 +869,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
837
869
  for img_tag in tag("img"):
838
870
  if isinstance(img_tag, Tag):
839
871
  im_ref = self._emit_image(img_tag, doc)
840
- added_ref.append(im_ref)
872
+ if im_ref:
873
+ added_ref.append(im_ref)
841
874
  return added_ref
842
875
 
843
876
  def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem:
@@ -1003,7 +1036,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
1003
1036
  img_tag = tag.find("img")
1004
1037
  if isinstance(img_tag, Tag):
1005
1038
  im_ref = self._emit_image(img_tag, doc)
1006
- added_refs.append(im_ref)
1039
+ if im_ref is not None:
1040
+ added_refs.append(im_ref)
1007
1041
 
1008
1042
  elif tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
1009
1043
  heading_refs = self._handle_heading(tag, doc)
@@ -1061,7 +1095,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
1061
1095
  for img_tag in tag("img"):
1062
1096
  if isinstance(img_tag, Tag):
1063
1097
  im_ref2 = self._emit_image(tag, doc)
1064
- added_refs.append(im_ref2)
1098
+ if im_ref2 is not None:
1099
+ added_refs.append(im_ref2)
1065
1100
 
1066
1101
  elif tag_name in {"pre"}:
1067
1102
  # handle monospace code snippets (pre).
@@ -1092,10 +1127,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
1092
1127
  self._walk(tag, doc)
1093
1128
  return added_refs
1094
1129
 
1095
- def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> RefItem:
1130
+ def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> Optional[RefItem]:
1096
1131
  figure = img_tag.find_parent("figure")
1097
1132
  caption: AnnotatedTextList = AnnotatedTextList()
1098
1133
 
1134
+ parent = self.parents[self.level]
1135
+
1099
1136
  # check if the figure has a link - this is HACK:
1100
1137
  def get_img_hyperlink(img_tag):
1101
1138
  this_parent = img_tag.parent
@@ -1106,9 +1143,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
1106
1143
  return None
1107
1144
 
1108
1145
  if img_hyperlink := get_img_hyperlink(img_tag):
1109
- caption.append(
1110
- AnnotatedText(text="Image Hyperlink.", hyperlink=img_hyperlink)
1111
- )
1146
+ img_text = img_tag.get("alt") or ""
1147
+ caption.append(AnnotatedText(text=img_text, hyperlink=img_hyperlink))
1112
1148
 
1113
1149
  if isinstance(figure, Tag):
1114
1150
  caption_tag = figure.find("figcaption", recursive=False)
@@ -1135,13 +1171,78 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
1135
1171
  hyperlink=caption_anno_text.hyperlink,
1136
1172
  )
1137
1173
 
1174
+ src_loc: str = self._get_attr_as_string(img_tag, "src")
1175
+ if not cast(HTMLBackendOptions, self.options).fetch_images or not src_loc:
1176
+ # Do not fetch the image, just add a placeholder
1177
+ placeholder: PictureItem = doc.add_picture(
1178
+ caption=caption_item,
1179
+ parent=parent,
1180
+ content_layer=self.content_layer,
1181
+ )
1182
+ return placeholder.get_ref()
1183
+
1184
+ src_loc = self._resolve_relative_path(src_loc)
1185
+ img_ref = self._create_image_ref(src_loc)
1186
+
1138
1187
  docling_pic = doc.add_picture(
1188
+ image=img_ref,
1139
1189
  caption=caption_item,
1140
- parent=self.parents[self.level],
1190
+ parent=parent,
1141
1191
  content_layer=self.content_layer,
1142
1192
  )
1143
1193
  return docling_pic.get_ref()
1144
1194
 
1195
+ def _create_image_ref(self, src_url: str) -> Optional[ImageRef]:
1196
+ try:
1197
+ img_data = self._load_image_data(src_url)
1198
+ if img_data:
1199
+ img = Image.open(BytesIO(img_data))
1200
+ return ImageRef.from_pil(img, dpi=int(img.info.get("dpi", (72,))[0]))
1201
+ except (
1202
+ requests.HTTPError,
1203
+ ValidationError,
1204
+ UnidentifiedImageError,
1205
+ OperationNotAllowed,
1206
+ TypeError,
1207
+ ValueError,
1208
+ ) as e:
1209
+ warnings.warn(f"Could not process an image from {src_url}: {e}")
1210
+
1211
+ return None
1212
+
1213
+ def _load_image_data(self, src_loc: str) -> Optional[bytes]:
1214
+ if src_loc.lower().endswith(".svg"):
1215
+ _log.debug(f"Skipping SVG file: {src_loc}")
1216
+ return None
1217
+
1218
+ if HTMLDocumentBackend._is_remote_url(src_loc):
1219
+ if not self.options.enable_remote_fetch:
1220
+ raise OperationNotAllowed(
1221
+ "Fetching remote resources is only allowed when set explicitly. "
1222
+ "Set options.enable_remote_fetch=True."
1223
+ )
1224
+ response = requests.get(src_loc, stream=True)
1225
+ response.raise_for_status()
1226
+ return response.content
1227
+ elif src_loc.startswith("data:"):
1228
+ data = re.sub(r"^data:image/.+;base64,", "", src_loc)
1229
+ return base64.b64decode(data)
1230
+
1231
+ if src_loc.startswith("file://"):
1232
+ src_loc = src_loc[7:]
1233
+
1234
+ if not self.options.enable_local_fetch:
1235
+ raise OperationNotAllowed(
1236
+ "Fetching local resources is only allowed when set explicitly. "
1237
+ "Set options.enable_local_fetch=True."
1238
+ )
1239
+ # add check that file exists and can read
1240
+ if os.path.isfile(src_loc) and os.access(src_loc, os.R_OK):
1241
+ with open(src_loc, "rb") as f:
1242
+ return f.read()
1243
+ else:
1244
+ raise ValueError("File does not exist or it is not readable.")
1245
+
1145
1246
  @staticmethod
1146
1247
  def get_text(item: PageElement) -> str:
1147
1248
  """Concatenate all child strings of a PageElement.
@@ -1238,3 +1339,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
1238
1339
  )
1239
1340
 
1240
1341
  return int_spans
1342
+
1343
+ @staticmethod
1344
+ def _get_attr_as_string(tag: Tag, attr: str, default: str = "") -> str:
1345
+ """Get attribute value as string, handling list values."""
1346
+ value = tag.get(attr)
1347
+ if not value:
1348
+ return default
1349
+
1350
+ return value[0] if isinstance(value, list) else value
@@ -24,10 +24,16 @@ from docling_core.types.doc import (
24
24
  from docling_core.types.doc.document import Formatting
25
25
  from marko import Markdown
26
26
  from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
27
- from typing_extensions import Annotated
27
+ from typing_extensions import Annotated, override
28
28
 
29
- from docling.backend.abstract_backend import DeclarativeDocumentBackend
29
+ from docling.backend.abstract_backend import (
30
+ DeclarativeDocumentBackend,
31
+ )
30
32
  from docling.backend.html_backend import HTMLDocumentBackend
33
+ from docling.datamodel.backend_options import (
34
+ HTMLBackendOptions,
35
+ MarkdownBackendOptions,
36
+ )
31
37
  from docling.datamodel.base_models import InputFormat
32
38
  from docling.datamodel.document import InputDocument
33
39
 
@@ -88,8 +94,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
88
94
 
89
95
  return shortened_text
90
96
 
91
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
92
- super().__init__(in_doc, path_or_stream)
97
+ @override
98
+ def __init__(
99
+ self,
100
+ in_doc: InputDocument,
101
+ path_or_stream: Union[BytesIO, Path],
102
+ options: MarkdownBackendOptions = MarkdownBackendOptions(),
103
+ ):
104
+ super().__init__(in_doc, path_or_stream, options)
93
105
 
94
106
  _log.debug("Starting MarkdownDocumentBackend...")
95
107
 
@@ -575,14 +587,24 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
575
587
  self._html_blocks = 0
576
588
  # delegate to HTML backend
577
589
  stream = BytesIO(bytes(html_str, encoding="utf-8"))
590
+ md_options = cast(MarkdownBackendOptions, self.options)
591
+ html_options = HTMLBackendOptions(
592
+ enable_local_fetch=md_options.enable_local_fetch,
593
+ enable_remote_fetch=md_options.enable_remote_fetch,
594
+ fetch_images=md_options.fetch_images,
595
+ source_uri=md_options.source_uri,
596
+ )
578
597
  in_doc = InputDocument(
579
598
  path_or_stream=stream,
580
599
  format=InputFormat.HTML,
581
600
  backend=html_backend_cls,
582
601
  filename=self.file.name,
602
+ backend_options=html_options,
583
603
  )
584
604
  html_backend_obj = html_backend_cls(
585
- in_doc=in_doc, path_or_stream=stream
605
+ in_doc=in_doc,
606
+ path_or_stream=stream,
607
+ options=html_options,
586
608
  )
587
609
  doc = html_backend_obj.convert()
588
610
  else: