docling 2.42.0__tar.gz → 2.42.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. {docling-2.42.0 → docling-2.42.2}/PKG-INFO +2 -1
  2. {docling-2.42.0 → docling-2.42.2}/README.md +1 -0
  3. docling-2.42.2/docling/backend/html_backend.py +570 -0
  4. {docling-2.42.0 → docling-2.42.2}/docling/backend/msword_backend.py +10 -1
  5. {docling-2.42.0 → docling-2.42.2}/docling/backend/pdf_backend.py +25 -1
  6. {docling-2.42.0 → docling-2.42.2}/docling/pipeline/base_pipeline.py +7 -1
  7. {docling-2.42.0 → docling-2.42.2}/docling/utils/layout_postprocessor.py +7 -2
  8. {docling-2.42.0 → docling-2.42.2}/docling.egg-info/PKG-INFO +2 -1
  9. {docling-2.42.0 → docling-2.42.2}/pyproject.toml +1 -1
  10. {docling-2.42.0 → docling-2.42.2}/tests/test_backend_html.py +2 -6
  11. {docling-2.42.0 → docling-2.42.2}/tests/test_input_doc.py +23 -0
  12. docling-2.42.0/docling/backend/html_backend.py +0 -585
  13. {docling-2.42.0 → docling-2.42.2}/LICENSE +0 -0
  14. {docling-2.42.0 → docling-2.42.2}/docling/__init__.py +0 -0
  15. {docling-2.42.0 → docling-2.42.2}/docling/backend/__init__.py +0 -0
  16. {docling-2.42.0 → docling-2.42.2}/docling/backend/abstract_backend.py +0 -0
  17. {docling-2.42.0 → docling-2.42.2}/docling/backend/asciidoc_backend.py +0 -0
  18. {docling-2.42.0 → docling-2.42.2}/docling/backend/csv_backend.py +0 -0
  19. {docling-2.42.0 → docling-2.42.2}/docling/backend/docling_parse_backend.py +0 -0
  20. {docling-2.42.0 → docling-2.42.2}/docling/backend/docling_parse_v2_backend.py +0 -0
  21. {docling-2.42.0 → docling-2.42.2}/docling/backend/docling_parse_v4_backend.py +0 -0
  22. {docling-2.42.0 → docling-2.42.2}/docling/backend/docx/__init__.py +0 -0
  23. {docling-2.42.0 → docling-2.42.2}/docling/backend/docx/latex/__init__.py +0 -0
  24. {docling-2.42.0 → docling-2.42.2}/docling/backend/docx/latex/latex_dict.py +0 -0
  25. {docling-2.42.0 → docling-2.42.2}/docling/backend/docx/latex/omml.py +0 -0
  26. {docling-2.42.0 → docling-2.42.2}/docling/backend/json/__init__.py +0 -0
  27. {docling-2.42.0 → docling-2.42.2}/docling/backend/json/docling_json_backend.py +0 -0
  28. {docling-2.42.0 → docling-2.42.2}/docling/backend/md_backend.py +0 -0
  29. {docling-2.42.0 → docling-2.42.2}/docling/backend/msexcel_backend.py +0 -0
  30. {docling-2.42.0 → docling-2.42.2}/docling/backend/mspowerpoint_backend.py +0 -0
  31. {docling-2.42.0 → docling-2.42.2}/docling/backend/noop_backend.py +0 -0
  32. {docling-2.42.0 → docling-2.42.2}/docling/backend/pypdfium2_backend.py +0 -0
  33. {docling-2.42.0 → docling-2.42.2}/docling/backend/xml/__init__.py +0 -0
  34. {docling-2.42.0 → docling-2.42.2}/docling/backend/xml/jats_backend.py +0 -0
  35. {docling-2.42.0 → docling-2.42.2}/docling/backend/xml/uspto_backend.py +0 -0
  36. {docling-2.42.0 → docling-2.42.2}/docling/chunking/__init__.py +0 -0
  37. {docling-2.42.0 → docling-2.42.2}/docling/cli/__init__.py +0 -0
  38. {docling-2.42.0 → docling-2.42.2}/docling/cli/main.py +0 -0
  39. {docling-2.42.0 → docling-2.42.2}/docling/cli/models.py +0 -0
  40. {docling-2.42.0 → docling-2.42.2}/docling/cli/tools.py +0 -0
  41. {docling-2.42.0 → docling-2.42.2}/docling/datamodel/__init__.py +0 -0
  42. {docling-2.42.0 → docling-2.42.2}/docling/datamodel/accelerator_options.py +0 -0
  43. {docling-2.42.0 → docling-2.42.2}/docling/datamodel/asr_model_specs.py +0 -0
  44. {docling-2.42.0 → docling-2.42.2}/docling/datamodel/base_models.py +0 -0
  45. {docling-2.42.0 → docling-2.42.2}/docling/datamodel/document.py +0 -0
  46. {docling-2.42.0 → docling-2.42.2}/docling/datamodel/layout_model_specs.py +0 -0
  47. {docling-2.42.0 → docling-2.42.2}/docling/datamodel/pipeline_options.py +0 -0
  48. {docling-2.42.0 → docling-2.42.2}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  49. {docling-2.42.0 → docling-2.42.2}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
  50. {docling-2.42.0 → docling-2.42.2}/docling/datamodel/settings.py +0 -0
  51. {docling-2.42.0 → docling-2.42.2}/docling/datamodel/vlm_model_specs.py +0 -0
  52. {docling-2.42.0 → docling-2.42.2}/docling/document_converter.py +0 -0
  53. {docling-2.42.0 → docling-2.42.2}/docling/exceptions.py +0 -0
  54. {docling-2.42.0 → docling-2.42.2}/docling/models/__init__.py +0 -0
  55. {docling-2.42.0 → docling-2.42.2}/docling/models/api_vlm_model.py +0 -0
  56. {docling-2.42.0 → docling-2.42.2}/docling/models/base_model.py +0 -0
  57. {docling-2.42.0 → docling-2.42.2}/docling/models/base_ocr_model.py +0 -0
  58. {docling-2.42.0 → docling-2.42.2}/docling/models/code_formula_model.py +0 -0
  59. {docling-2.42.0 → docling-2.42.2}/docling/models/document_picture_classifier.py +0 -0
  60. {docling-2.42.0 → docling-2.42.2}/docling/models/easyocr_model.py +0 -0
  61. {docling-2.42.0 → docling-2.42.2}/docling/models/factories/__init__.py +0 -0
  62. {docling-2.42.0 → docling-2.42.2}/docling/models/factories/base_factory.py +0 -0
  63. {docling-2.42.0 → docling-2.42.2}/docling/models/factories/ocr_factory.py +0 -0
  64. {docling-2.42.0 → docling-2.42.2}/docling/models/factories/picture_description_factory.py +0 -0
  65. {docling-2.42.0 → docling-2.42.2}/docling/models/layout_model.py +0 -0
  66. {docling-2.42.0 → docling-2.42.2}/docling/models/ocr_mac_model.py +0 -0
  67. {docling-2.42.0 → docling-2.42.2}/docling/models/page_assemble_model.py +0 -0
  68. {docling-2.42.0 → docling-2.42.2}/docling/models/page_preprocessing_model.py +0 -0
  69. {docling-2.42.0 → docling-2.42.2}/docling/models/picture_description_api_model.py +0 -0
  70. {docling-2.42.0 → docling-2.42.2}/docling/models/picture_description_base_model.py +0 -0
  71. {docling-2.42.0 → docling-2.42.2}/docling/models/picture_description_vlm_model.py +0 -0
  72. {docling-2.42.0 → docling-2.42.2}/docling/models/plugins/__init__.py +0 -0
  73. {docling-2.42.0 → docling-2.42.2}/docling/models/plugins/defaults.py +0 -0
  74. {docling-2.42.0 → docling-2.42.2}/docling/models/rapid_ocr_model.py +0 -0
  75. {docling-2.42.0 → docling-2.42.2}/docling/models/readingorder_model.py +0 -0
  76. {docling-2.42.0 → docling-2.42.2}/docling/models/table_structure_model.py +0 -0
  77. {docling-2.42.0 → docling-2.42.2}/docling/models/tesseract_ocr_cli_model.py +0 -0
  78. {docling-2.42.0 → docling-2.42.2}/docling/models/tesseract_ocr_model.py +0 -0
  79. {docling-2.42.0 → docling-2.42.2}/docling/models/utils/__init__.py +0 -0
  80. {docling-2.42.0 → docling-2.42.2}/docling/models/utils/hf_model_download.py +0 -0
  81. {docling-2.42.0 → docling-2.42.2}/docling/models/vlm_models_inline/__init__.py +0 -0
  82. {docling-2.42.0 → docling-2.42.2}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
  83. {docling-2.42.0 → docling-2.42.2}/docling/models/vlm_models_inline/mlx_model.py +0 -0
  84. {docling-2.42.0 → docling-2.42.2}/docling/pipeline/__init__.py +0 -0
  85. {docling-2.42.0 → docling-2.42.2}/docling/pipeline/asr_pipeline.py +0 -0
  86. {docling-2.42.0 → docling-2.42.2}/docling/pipeline/simple_pipeline.py +0 -0
  87. {docling-2.42.0 → docling-2.42.2}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  88. {docling-2.42.0 → docling-2.42.2}/docling/pipeline/vlm_pipeline.py +0 -0
  89. {docling-2.42.0 → docling-2.42.2}/docling/py.typed +0 -0
  90. {docling-2.42.0 → docling-2.42.2}/docling/utils/__init__.py +0 -0
  91. {docling-2.42.0 → docling-2.42.2}/docling/utils/accelerator_utils.py +0 -0
  92. {docling-2.42.0 → docling-2.42.2}/docling/utils/api_image_request.py +0 -0
  93. {docling-2.42.0 → docling-2.42.2}/docling/utils/export.py +0 -0
  94. {docling-2.42.0 → docling-2.42.2}/docling/utils/glm_utils.py +0 -0
  95. {docling-2.42.0 → docling-2.42.2}/docling/utils/locks.py +0 -0
  96. {docling-2.42.0 → docling-2.42.2}/docling/utils/model_downloader.py +0 -0
  97. {docling-2.42.0 → docling-2.42.2}/docling/utils/ocr_utils.py +0 -0
  98. {docling-2.42.0 → docling-2.42.2}/docling/utils/orientation.py +0 -0
  99. {docling-2.42.0 → docling-2.42.2}/docling/utils/profiling.py +0 -0
  100. {docling-2.42.0 → docling-2.42.2}/docling/utils/utils.py +0 -0
  101. {docling-2.42.0 → docling-2.42.2}/docling/utils/visualization.py +0 -0
  102. {docling-2.42.0 → docling-2.42.2}/docling.egg-info/SOURCES.txt +0 -0
  103. {docling-2.42.0 → docling-2.42.2}/docling.egg-info/dependency_links.txt +0 -0
  104. {docling-2.42.0 → docling-2.42.2}/docling.egg-info/entry_points.txt +0 -0
  105. {docling-2.42.0 → docling-2.42.2}/docling.egg-info/requires.txt +0 -0
  106. {docling-2.42.0 → docling-2.42.2}/docling.egg-info/top_level.txt +0 -0
  107. {docling-2.42.0 → docling-2.42.2}/setup.cfg +0 -0
  108. {docling-2.42.0 → docling-2.42.2}/tests/test_asr_pipeline.py +0 -0
  109. {docling-2.42.0 → docling-2.42.2}/tests/test_backend_asciidoc.py +0 -0
  110. {docling-2.42.0 → docling-2.42.2}/tests/test_backend_csv.py +0 -0
  111. {docling-2.42.0 → docling-2.42.2}/tests/test_backend_docling_json.py +0 -0
  112. {docling-2.42.0 → docling-2.42.2}/tests/test_backend_docling_parse.py +0 -0
  113. {docling-2.42.0 → docling-2.42.2}/tests/test_backend_docling_parse_v2.py +0 -0
  114. {docling-2.42.0 → docling-2.42.2}/tests/test_backend_docling_parse_v4.py +0 -0
  115. {docling-2.42.0 → docling-2.42.2}/tests/test_backend_jats.py +0 -0
  116. {docling-2.42.0 → docling-2.42.2}/tests/test_backend_markdown.py +0 -0
  117. {docling-2.42.0 → docling-2.42.2}/tests/test_backend_msexcel.py +0 -0
  118. {docling-2.42.0 → docling-2.42.2}/tests/test_backend_msword.py +0 -0
  119. {docling-2.42.0 → docling-2.42.2}/tests/test_backend_patent_uspto.py +0 -0
  120. {docling-2.42.0 → docling-2.42.2}/tests/test_backend_pdfium.py +0 -0
  121. {docling-2.42.0 → docling-2.42.2}/tests/test_backend_pptx.py +0 -0
  122. {docling-2.42.0 → docling-2.42.2}/tests/test_backend_webp.py +0 -0
  123. {docling-2.42.0 → docling-2.42.2}/tests/test_cli.py +0 -0
  124. {docling-2.42.0 → docling-2.42.2}/tests/test_code_formula.py +0 -0
  125. {docling-2.42.0 → docling-2.42.2}/tests/test_data_gen_flag.py +0 -0
  126. {docling-2.42.0 → docling-2.42.2}/tests/test_document_picture_classifier.py +0 -0
  127. {docling-2.42.0 → docling-2.42.2}/tests/test_e2e_conversion.py +0 -0
  128. {docling-2.42.0 → docling-2.42.2}/tests/test_e2e_ocr_conversion.py +0 -0
  129. {docling-2.42.0 → docling-2.42.2}/tests/test_interfaces.py +0 -0
  130. {docling-2.42.0 → docling-2.42.2}/tests/test_invalid_input.py +0 -0
  131. {docling-2.42.0 → docling-2.42.2}/tests/test_legacy_format_transform.py +0 -0
  132. {docling-2.42.0 → docling-2.42.2}/tests/test_ocr_utils.py +0 -0
  133. {docling-2.42.0 → docling-2.42.2}/tests/test_options.py +0 -0
  134. {docling-2.42.0 → docling-2.42.2}/tests/test_settings_load.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.42.0
3
+ Version: 2.42.2
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -89,6 +89,7 @@ Dynamic: license-file
89
89
  [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
90
90
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
91
91
  [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
92
+ [![Chat with Dosu](https://dosu.dev/dosu-chat-badge.svg)](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
92
93
  [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
93
94
  [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
94
95
 
@@ -21,6 +21,7 @@
21
21
  [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
22
22
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
23
23
  [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
24
+ [![Chat with Dosu](https://dosu.dev/dosu-chat-badge.svg)](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
24
25
  [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
25
26
  [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
26
27
 
@@ -0,0 +1,570 @@
1
+ import logging
2
+ import re
3
+ import traceback
4
+ from io import BytesIO
5
+ from pathlib import Path
6
+ from typing import Final, Optional, Union, cast
7
+
8
+ from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
9
+ from bs4.element import PreformattedString
10
+ from docling_core.types.doc import (
11
+ DocItem,
12
+ DocItemLabel,
13
+ DoclingDocument,
14
+ DocumentOrigin,
15
+ GroupItem,
16
+ GroupLabel,
17
+ TableCell,
18
+ TableData,
19
+ TextItem,
20
+ )
21
+ from docling_core.types.doc.document import ContentLayer
22
+ from pydantic import BaseModel
23
+ from typing_extensions import override
24
+
25
+ from docling.backend.abstract_backend import DeclarativeDocumentBackend
26
+ from docling.datamodel.base_models import InputFormat
27
+ from docling.datamodel.document import InputDocument
28
+
29
+ _log = logging.getLogger(__name__)
30
+
31
+ DEFAULT_IMAGE_WIDTH = 128
32
+ DEFAULT_IMAGE_HEIGHT = 128
33
+
34
+ # Tags that initiate distinct Docling items
35
+ _BLOCK_TAGS: Final = {
36
+ "address",
37
+ "details",
38
+ "figure",
39
+ "h1",
40
+ "h2",
41
+ "h3",
42
+ "h4",
43
+ "h5",
44
+ "h6",
45
+ "p",
46
+ "pre",
47
+ "code",
48
+ "ul",
49
+ "ol",
50
+ "summary",
51
+ "table",
52
+ }
53
+
54
+
55
class _Context(BaseModel):
    # Per-conversion bookkeeping for lists, keyed by the `self_ref` of the list
    # group created for each <ul>/<ol>.

    # True when the list group was created from an <ol>, False for a <ul>.
    list_ordered_flag_by_ref: dict[str, bool] = {}
    # Value of the numeric `start` attribute, recorded for ordered lists only.
    list_start_by_ref: dict[str, int] = {}
58
+
59
+
60
+ class HTMLDocumentBackend(DeclarativeDocumentBackend):
61
@override
def __init__(
    self,
    in_doc: InputDocument,
    path_or_stream: Union[BytesIO, Path],
):
    """Read the HTML input and parse it with BeautifulSoup.

    Args:
        in_doc: The input document descriptor, forwarded to the base class.
        path_or_stream: Either an in-memory stream or a filesystem path that
            holds the HTML content.

    Raises:
        RuntimeError: If the content cannot be read or parsed.
    """
    super().__init__(in_doc, path_or_stream)
    self.soup: Optional[Tag] = None
    self.path_or_stream = path_or_stream

    # Hierarchy state: `parents[n]` is the item acting as parent at depth n,
    # and `level` is the depth new items are currently attached to.
    self.max_levels = 10
    self.level = 0
    self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {
        depth: None for depth in range(self.max_levels)
    }
    self.ctx = _Context()

    try:
        if isinstance(path_or_stream, BytesIO):
            html_bytes = path_or_stream.getvalue()
        else:
            html_bytes = Path(path_or_stream).read_bytes()
        self.soup = BeautifulSoup(html_bytes, "html.parser")
    except Exception as exc:
        raise RuntimeError(
            "Could not initialize HTML backend for file with "
            f"hash {self.document_hash}."
        ) from exc
91
+
92
@override
def is_valid(self) -> bool:
    """Return True when the HTML content has been parsed into `self.soup`."""
    if self.soup is None:
        return False
    return True
95
+
96
@classmethod
@override
def supports_pagination(cls) -> bool:
    """Report that this backend does not support pagination."""
    paginated = False
    return paginated
100
+
101
@override
def unload(self):
    """Close the input stream, if it is in-memory, and drop the reference."""
    source = self.path_or_stream
    if isinstance(source, BytesIO):
        source.close()
    self.path_or_stream = None
106
+
107
@classmethod
@override
def supported_formats(cls) -> set[InputFormat]:
    """Return the input formats handled by this backend (HTML only)."""
    formats: set[InputFormat] = {InputFormat.HTML}
    return formats
111
+
112
@override
def convert(self) -> DoclingDocument:
    """Convert the parsed HTML into a DoclingDocument.

    The <title> element, if any, is added as a furniture title. <script> and
    <style> elements are removed and <br> tags are replaced by newlines before
    the body (or the whole soup, when there is no <body>) is walked.

    Conversion is best effort: an exception raised while walking the tree is
    logged and the document built so far is returned.

    Returns:
        The document populated with the parsed content.

    Raises:
        RuntimeError: If the backend holds no parsed HTML.
    """
    _log.debug("Starting HTML conversion...")
    if not self.is_valid():
        raise RuntimeError("Invalid HTML document.")

    origin = DocumentOrigin(
        filename=self.file.name or "file",
        mimetype="text/html",
        binary_hash=self.document_hash,
    )
    doc = DoclingDocument(name=self.file.stem or "file", origin=origin)

    assert self.soup is not None
    # set the title as furniture, since it is part of the document metadata
    title = self.soup.title
    if title:
        doc.add_title(
            text=title.get_text(separator=" ", strip=True),
            content_layer=ContentLayer.FURNITURE,
        )
    # remove scripts/styles
    for tag in self.soup(["script", "style"]):
        tag.decompose()
    content = self.soup.body or self.soup
    # normalize <br> tags
    for br in content("br"):
        br.replace_with(NavigableString("\n"))
    # set default content layer: everything before the first heading is
    # furniture; documents without any heading are body content throughout
    headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
    self.content_layer = (
        ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
    )
    # reset context and hierarchy, so that a repeated call does not attach new
    # items to parents that belong to a previously built document
    self.ctx = _Context()
    self.level = 0
    for key in self.parents:
        self.parents[key] = None

    try:
        self._walk(content, doc)
    except Exception:
        # Best effort: keep what was parsed, but report the failure through
        # the module logger instead of printing to stdout.
        _log.exception(
            "Error while walking the HTML content; "
            "returning the document parsed so far."
        )

    return doc
154
+
155
def _walk(self, element: Tag, doc: DoclingDocument) -> None:
    """Parse an HTML tag by recursively walking its direct children.

    Inline text (plain strings and tags without block-level descendants, such
    as <b> or <span>) is accumulated and emitted as text items only when a
    block boundary, an image, or the end of the element is reached.

    Args:
        element: The tag whose content is parsed.
        doc: The Docling document to be updated with the parsed content.
    """
    pending: list[str] = []

    def emit_pending() -> None:
        # Emit one text item per non-blank line of the accumulated text.
        joined = "".join(pending).strip()
        pending.clear()
        for line in joined.split("\n"):
            stripped = line.strip()
            if not stripped:
                continue
            doc.add_text(
                DocItemLabel.TEXT,
                stripped,
                parent=self.parents[self.level],
                content_layer=self.content_layer,
            )

    for child in element.contents:
        if not isinstance(child, Tag):
            # Plain strings are inline text; preformatted strings are skipped.
            if isinstance(child, NavigableString) and not isinstance(
                child, PreformattedString
            ):
                pending.append(str(child))
            continue

        child_name = child.name.lower()
        if child_name == "img":
            emit_pending()
            self._emit_image(child, doc)
        elif child_name in _BLOCK_TAGS:
            emit_pending()
            self._handle_block(child, doc)
        elif child.find(_BLOCK_TAGS):
            # A container holding block tags somewhere below: descend into it.
            emit_pending()
            self._walk(child, doc)
        else:
            pending.append(child.text)

    emit_pending()
204
+
205
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
    """Add a heading tag (<h1> to <h6>) to the document and update the hierarchy.

    An <h1> resets the hierarchy and is added as the document title. Any other
    heading is added as a section heading one level lower than its tag number,
    creating intermediate section groups or dropping deeper parents as needed.
    Images nested in the heading are emitted afterwards.

    Args:
        tag: The heading tag.
        doc: The Docling document to be updated.
    """
    # set default content layer to BODY as soon as we encounter a heading
    self.content_layer = ContentLayer.BODY
    heading_level = int(tag.name.lower()[1])
    heading_text = tag.get_text(strip=True, separator=" ")

    if heading_level == 1:
        # the first level is for the title item
        for key in self.parents:
            self.parents[key] = None
        self.level = 0
        self.parents[1] = doc.add_title(
            heading_text, content_layer=self.content_layer
        )
    else:
        # the other levels need to be lowered by 1 if a title was set
        target = heading_level - 1
        if target > self.level:
            # add invisible groups for the skipped levels
            for depth in range(self.level, target):
                _log.debug(f"Adding invisible group to level {depth}")
                self.parents[depth + 1] = doc.add_group(
                    name=f"header-{depth + 1}",
                    label=GroupLabel.SECTION,
                    parent=self.parents[depth],
                    content_layer=self.content_layer,
                )
            self.level = target
        elif target < self.level:
            # remove the tail
            for key in self.parents:
                if key > target + 1:
                    _log.debug(f"Remove the tail of level {key}")
                    self.parents[key] = None
            self.level = target

        self.parents[self.level + 1] = doc.add_heading(
            parent=self.parents[self.level],
            text=heading_text,
            level=self.level,
            content_layer=self.content_layer,
        )
        self.level += 1

    for image in tag("img"):
        if isinstance(image, Tag):
            self._emit_image(image, doc)
250
+
251
def _handle_list(self, tag: Tag, doc: DoclingDocument) -> None:
    """Add a <ul> or <ol> tag and its items to the document.

    A list group is created under the current parent, then each direct <li>
    child is added as a list item. Nested lists are attached to their list item
    when the item has text, otherwise to the list group itself.

    Args:
        tag: The list tag (<ul> or <ol>).
        doc: The Docling document to be updated.
    """
    is_ordered = tag.name.lower() == "ol"
    start: Optional[int] = None
    if is_ordered:
        raw_start = tag.get("start")
        if isinstance(raw_start, str) and raw_start.isnumeric():
            start = int(raw_start)
        name: str = "ordered list" + (
            f" start {start}" if start is not None else ""
        )
    else:
        name = "list"

    # Create the list container
    list_group = doc.add_list_group(
        name=name,
        parent=self.parents[self.level],
        content_layer=self.content_layer,
    )
    self.parents[self.level + 1] = list_group
    self.ctx.list_ordered_flag_by_ref[list_group.self_ref] = is_ordered
    if is_ordered and start is not None:
        self.ctx.list_start_by_ref[list_group.self_ref] = start
    self.level += 1

    # For each direct child that is a list item or a (misplaced) list
    for entry in tag.find_all({"li", "ul", "ol"}, recursive=False):
        if not isinstance(entry, Tag):
            continue

        # sub-list items should be indented under main list items, but temporarily
        # addressing invalid HTML (docling-core/issues/357)
        if entry.name in {"ul", "ol"}:
            self._handle_block(entry, doc)
            continue

        # 1) determine the marker
        marker = ""
        if is_ordered and start is not None:
            marker = f"{start + len(list_group.children)}."

        # 2) extract only the "direct" text from this <li>
        fragments: list[str] = []
        for node in entry.contents:
            if isinstance(node, NavigableString):
                if not isinstance(node, PreformattedString):
                    fragments.append(node)
            elif isinstance(node, Tag) and node.name not in ("ul", "ol"):
                fragment = HTMLDocumentBackend.get_text(node)
                if fragment:
                    fragments.append(fragment)
        item_text = re.sub(r"\s+|\n+", " ", "".join(fragments)).strip()

        # 3) add the list item
        has_text = bool(item_text)
        if has_text:
            self.parents[self.level + 1] = doc.add_list_item(
                text=item_text,
                enumerated=is_ordered,
                marker=marker,
                parent=list_group,
                content_layer=self.content_layer,
            )

        # 4) recurse into any nested lists, attaching them to this <li> item
        #    when it exists, otherwise to the current list group
        for nested in entry({"ul", "ol"}, recursive=False):
            if not isinstance(nested, Tag):
                continue
            if has_text:
                self.level += 1
                self._handle_block(nested, doc)
                self.parents[self.level + 1] = None
                self.level -= 1
            else:
                self._handle_block(nested, doc)

        # 5) extract any images under this <li>
        for image in entry("img"):
            if isinstance(image, Tag):
                self._emit_image(image, doc)

    self.parents[self.level + 1] = None
    self.level -= 1
334
+
335
def _handle_block(self, tag: Tag, doc: DoclingDocument) -> None:
    """Dispatch a block-level tag to the matching conversion logic.

    Handles figures, headings, lists, text containers (<p>, <address>,
    <summary>), tables, code snippets (<pre>, <code>) and <details>. Tags with
    any other name are ignored.

    Args:
        tag: The block tag to convert.
        doc: The Docling document to be updated.
    """
    tag_name = tag.name.lower()

    if tag_name == "figure":
        # only the first image of a figure is emitted
        img_tag = tag.find("img")
        if isinstance(img_tag, Tag):
            self._emit_image(img_tag, doc)

    elif tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
        self._handle_heading(tag, doc)

    elif tag_name in {"ul", "ol"}:
        self._handle_list(tag, doc)

    elif tag_name in {"p", "address", "summary"}:
        # one text item per non-blank line
        for part in tag.text.split("\n"):
            seg = part.strip()
            if seg:
                doc.add_text(
                    parent=self.parents[self.level],
                    label=DocItemLabel.TEXT,
                    text=seg,
                    content_layer=self.content_layer,
                )
        for img_tag in tag("img"):
            if isinstance(img_tag, Tag):
                self._emit_image(img_tag, doc)

    elif tag_name == "table":
        data = HTMLDocumentBackend.parse_table_data(tag)
        for img_tag in tag("img"):
            if isinstance(img_tag, Tag):
                # pass the image itself (not the table), so that its figure
                # caption or `alt` text is picked up
                self._emit_image(img_tag, doc)
        if data is not None:
            doc.add_table(
                data=data,
                parent=self.parents[self.level],
                content_layer=self.content_layer,
            )

    elif tag_name in {"pre", "code"}:
        # handle monospace code snippets (pre).
        text = tag.get_text(strip=True)
        if text:
            doc.add_code(
                parent=self.parents[self.level],
                text=text,
                content_layer=self.content_layer,
            )

    elif tag_name == "details":
        # handle details and its content: the content is walked one level
        # deeper, under a dedicated section group
        self.parents[self.level + 1] = doc.add_group(
            name="details",
            label=GroupLabel.SECTION,
            parent=self.parents[self.level],
            content_layer=self.content_layer,
        )
        self.level += 1
        self._walk(tag, doc)
        self.parents[self.level + 1] = None
        self.level -= 1
397
+
398
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
    """Add a picture item, with an optional caption, for an image tag.

    The caption is taken from a <figcaption> that is a direct child of the
    closest enclosing <figure>; when that yields no text, the stripped `alt`
    attribute of the tag is used instead.

    Args:
        img_tag: The tag to emit a picture for.
        doc: The Docling document to be updated.
    """
    caption_text: str = ""
    enclosing_figure = img_tag.find_parent("figure")
    if isinstance(enclosing_figure, Tag):
        figcaption = enclosing_figure.find("figcaption", recursive=False)
        if isinstance(figcaption, Tag):
            caption_text = figcaption.get_text()
    if not caption_text:
        caption_text = str(img_tag.get("alt", "")).strip()

    caption_item: Optional[TextItem] = (
        doc.add_text(
            DocItemLabel.CAPTION,
            text=caption_text,
            content_layer=self.content_layer,
        )
        if caption_text
        else None
    )

    doc.add_picture(
        caption=caption_item,
        parent=self.parents[self.level],
        content_layer=self.content_layer,
    )
419
+
420
@staticmethod
def get_text(item: PageElement) -> str:
    """Concatenate all the strings found under a PageElement.

    The strings are joined without separators, except that the text of every
    <p> or <li> tag is followed by a single space. Elements that are neither
    strings nor tags contribute nothing.
    """

    def _collect(node: PageElement) -> str:
        """Return the text of `node` and, recursively, of its children."""
        if isinstance(node, NavigableString):
            return str(node)
        if not isinstance(node, Tag):
            return ""
        inner = "".join(_collect(child) for child in cast(Tag, node))
        if node.name in {"p", "li"}:
            return inner + " "
        return inner

    return _collect(item)
449
+
450
+ @staticmethod
451
+ def _get_cell_spans(cell: Tag) -> tuple[int, int]:
452
+ """Extract colspan and rowspan values from a table cell tag.
453
+
454
+ This function retrieves the 'colspan' and 'rowspan' attributes from a given
455
+ table cell tag.
456
+ If the attribute does not exist or it is not numeric, it defaults to 1.
457
+ """
458
+ raw_spans: tuple[str, str] = (
459
+ str(cell.get("colspan", "1")),
460
+ str(cell.get("rowspan", "1")),
461
+ )
462
+ int_spans: tuple[int, int] = (
463
+ int(raw_spans[0]) if raw_spans[0].isnumeric() else 1,
464
+ int(raw_spans[1]) if raw_spans[0].isnumeric() else 1,
465
+ )
466
+
467
+ return int_spans
468
+
469
@staticmethod
def parse_table_data(element: Tag) -> Optional[TableData]:  # noqa: C901
    """Convert a <table> tag into table data.

    A first pass over the rows computes the table dimensions, taking column
    spans into account. A second pass fills an occupancy grid and creates one
    table cell per <td>/<th>. A row made only of <th> cells that all span
    several rows is treated as a row header: it is not counted as a row of its
    own and its cells are placed on the following row.

    Args:
        element: The table tag.

    Returns:
        The parsed table data, or None if the table contains a nested table.
    """
    nested_tables = element.find("table")
    if nested_tables is not None:
        _log.debug("Skipping nested table.")
        return None

    # Find the number of rows and columns (taking into account spans)
    num_rows = 0
    num_cols = 0
    for row in element("tr"):
        col_count = 0
        is_row_header = True
        if not isinstance(row, Tag):
            continue
        for cell in row(["td", "th"]):
            # guard the cell itself before using it as a tag
            if not isinstance(cell, Tag):
                continue
            col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell)
            col_count += col_span
            if cell.name == "td" or row_span == 1:
                is_row_header = False
        num_cols = max(num_cols, col_count)
        if not is_row_header:
            num_rows += 1

    _log.debug(f"The table has {num_rows} rows and {num_cols} cols.")

    # occupancy grid: a slot is None until a cell covers it
    grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]

    data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

    # Iterate over the rows in the table
    start_row_span = 0
    row_idx = -1
    for row in element("tr"):
        if not isinstance(row, Tag):
            continue

        # For each row, find all the column cells (both <td> and <th>)
        cells = row(["td", "th"])

        # Check if cell is in a column header or row header
        col_header = True
        row_header = True
        for html_cell in cells:
            if isinstance(html_cell, Tag):
                _, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
                if html_cell.name == "td":
                    col_header = False
                    row_header = False
                elif row_span == 1:
                    row_header = False
        if not row_header:
            row_idx += 1
            start_row_span = 0
        else:
            # a row header does not advance the row index; its cells are
            # offset onto the next row instead
            start_row_span += 1

        # Extract the text content of each cell
        col_idx = 0
        for html_cell in cells:
            if not isinstance(html_cell, Tag):
                continue

            # extract inline formulas
            for formula in html_cell("inline-formula"):
                math_parts = formula.text.split("$$")
                if len(math_parts) == 3:
                    math_formula = f"$${math_parts[1]}$$"
                    formula.replace_with(NavigableString(math_formula))

            # TODO: extract content correctly from table-cells with lists
            text = HTMLDocumentBackend.get_text(html_cell).strip()
            col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
            if row_header:
                row_span -= 1
            # skip the slots already covered by cells spanning from above
            while (
                col_idx < num_cols
                and grid[row_idx + start_row_span][col_idx] is not None
            ):
                col_idx += 1
            for r in range(start_row_span, start_row_span + row_span):
                for c in range(col_span):
                    if row_idx + r < num_rows and col_idx + c < num_cols:
                        grid[row_idx + r][col_idx + c] = text

            table_cell = TableCell(
                text=text,
                row_span=row_span,
                col_span=col_span,
                start_row_offset_idx=start_row_span + row_idx,
                end_row_offset_idx=start_row_span + row_idx + row_span,
                start_col_offset_idx=col_idx,
                end_col_offset_idx=col_idx + col_span,
                column_header=col_header,
                row_header=((not col_header) and html_cell.name == "th"),
            )
            data.table_cells.append(table_cell)

    return data
@@ -1104,8 +1104,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1104
1104
  )
1105
1105
  _log.debug(f" spanned before row {spanned_idx}")
1106
1106
 
1107
+ # Detect equations in cell text
1108
+ text, equations = self._handle_equations_in_text(
1109
+ element=cell._element, text=cell.text
1110
+ )
1111
+ if len(equations) == 0:
1112
+ text = cell.text
1113
+ else:
1114
+ text = text.replace("<eq>", "$").replace("</eq>", "$")
1115
+
1107
1116
  table_cell = TableCell(
1108
- text=cell.text,
1117
+ text=text,
1109
1118
  row_span=spanned_idx - row_idx,
1110
1119
  col_span=cell.grid_span,
1111
1120
  start_row_offset_idx=row.grid_cols_before + row_idx,
@@ -57,7 +57,31 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
57
57
  if self.input_format is InputFormat.IMAGE:
58
58
  buf = BytesIO()
59
59
  img = Image.open(self.path_or_stream)
60
- img.save(buf, "PDF")
60
+
61
+ # Handle multi-page TIFF images
62
+ if hasattr(img, "n_frames") and img.n_frames > 1:
63
+ # Extract all frames from multi-page image
64
+ frames = []
65
+ try:
66
+ for i in range(img.n_frames):
67
+ img.seek(i)
68
+ frame = img.copy().convert("RGB")
69
+ frames.append(frame)
70
+ except EOFError:
71
+ pass
72
+
73
+ # Save as multi-page PDF
74
+ if frames:
75
+ frames[0].save(
76
+ buf, "PDF", save_all=True, append_images=frames[1:]
77
+ )
78
+ else:
79
+ # Fallback to single page if frame extraction fails
80
+ img.convert("RGB").save(buf, "PDF")
81
+ else:
82
+ # Single page image - convert to RGB and save
83
+ img.convert("RGB").save(buf, "PDF")
84
+
61
85
  buf.seek(0)
62
86
  self.path_or_stream = buf
63
87
  else:
@@ -217,7 +217,13 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
217
217
  return conv_res
218
218
 
219
219
  def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
220
- status = ConversionStatus.SUCCESS
220
+ status = conv_res.status
221
+ if status in [
222
+ ConversionStatus.PENDING,
223
+ ConversionStatus.STARTED,
224
+ ]: # preserves ConversionStatus.PARTIAL_SUCCESS
225
+ status = ConversionStatus.SUCCESS
226
+
221
227
  for page in conv_res.pages:
222
228
  if page._backend is None or not page._backend.is_valid():
223
229
  conv_res.errors.append(
@@ -267,9 +267,14 @@ class LayoutPostprocessor:
267
267
  # Initial cell assignment
268
268
  clusters = self._assign_cells_to_clusters(clusters)
269
269
 
270
- # Remove clusters with no cells (if keep_empty_clusters is False)
270
+ # Remove clusters with no cells (if keep_empty_clusters is False),
271
+ # but always keep clusters with label DocItemLabel.FORMULA
271
272
  if not self.options.keep_empty_clusters:
272
- clusters = [cluster for cluster in clusters if cluster.cells]
273
+ clusters = [
274
+ cluster
275
+ for cluster in clusters
276
+ if cluster.cells or cluster.label == DocItemLabel.FORMULA
277
+ ]
273
278
 
274
279
  # Handle orphaned cells
275
280
  unassigned = self._find_unassigned_cells(clusters)