docling 2.41.0__tar.gz → 2.42.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. {docling-2.41.0 → docling-2.42.1}/PKG-INFO +2 -1
  2. {docling-2.41.0 → docling-2.42.1}/docling/backend/docx/latex/omml.py +9 -1
  3. docling-2.42.1/docling/backend/html_backend.py +542 -0
  4. {docling-2.41.0 → docling-2.42.1}/docling/backend/xml/jats_backend.py +12 -4
  5. {docling-2.41.0 → docling-2.42.1}/docling/datamodel/pipeline_options.py +4 -1
  6. {docling-2.41.0 → docling-2.42.1}/docling/document_converter.py +14 -11
  7. {docling-2.41.0 → docling-2.42.1}/docling/models/picture_description_vlm_model.py +2 -1
  8. {docling-2.41.0 → docling-2.42.1}/docling/utils/layout_postprocessor.py +8 -2
  9. {docling-2.41.0 → docling-2.42.1}/docling.egg-info/PKG-INFO +2 -1
  10. {docling-2.41.0 → docling-2.42.1}/docling.egg-info/requires.txt +1 -0
  11. {docling-2.41.0 → docling-2.42.1}/pyproject.toml +2 -1
  12. {docling-2.41.0 → docling-2.42.1}/tests/test_backend_html.py +2 -6
  13. {docling-2.41.0 → docling-2.42.1}/tests/test_backend_jats.py +14 -14
  14. docling-2.41.0/docling/backend/html_backend.py +0 -577
  15. {docling-2.41.0 → docling-2.42.1}/LICENSE +0 -0
  16. {docling-2.41.0 → docling-2.42.1}/README.md +0 -0
  17. {docling-2.41.0 → docling-2.42.1}/docling/__init__.py +0 -0
  18. {docling-2.41.0 → docling-2.42.1}/docling/backend/__init__.py +0 -0
  19. {docling-2.41.0 → docling-2.42.1}/docling/backend/abstract_backend.py +0 -0
  20. {docling-2.41.0 → docling-2.42.1}/docling/backend/asciidoc_backend.py +0 -0
  21. {docling-2.41.0 → docling-2.42.1}/docling/backend/csv_backend.py +0 -0
  22. {docling-2.41.0 → docling-2.42.1}/docling/backend/docling_parse_backend.py +0 -0
  23. {docling-2.41.0 → docling-2.42.1}/docling/backend/docling_parse_v2_backend.py +0 -0
  24. {docling-2.41.0 → docling-2.42.1}/docling/backend/docling_parse_v4_backend.py +0 -0
  25. {docling-2.41.0 → docling-2.42.1}/docling/backend/docx/__init__.py +0 -0
  26. {docling-2.41.0 → docling-2.42.1}/docling/backend/docx/latex/__init__.py +0 -0
  27. {docling-2.41.0 → docling-2.42.1}/docling/backend/docx/latex/latex_dict.py +0 -0
  28. {docling-2.41.0 → docling-2.42.1}/docling/backend/json/__init__.py +0 -0
  29. {docling-2.41.0 → docling-2.42.1}/docling/backend/json/docling_json_backend.py +0 -0
  30. {docling-2.41.0 → docling-2.42.1}/docling/backend/md_backend.py +0 -0
  31. {docling-2.41.0 → docling-2.42.1}/docling/backend/msexcel_backend.py +0 -0
  32. {docling-2.41.0 → docling-2.42.1}/docling/backend/mspowerpoint_backend.py +0 -0
  33. {docling-2.41.0 → docling-2.42.1}/docling/backend/msword_backend.py +0 -0
  34. {docling-2.41.0 → docling-2.42.1}/docling/backend/noop_backend.py +0 -0
  35. {docling-2.41.0 → docling-2.42.1}/docling/backend/pdf_backend.py +0 -0
  36. {docling-2.41.0 → docling-2.42.1}/docling/backend/pypdfium2_backend.py +0 -0
  37. {docling-2.41.0 → docling-2.42.1}/docling/backend/xml/__init__.py +0 -0
  38. {docling-2.41.0 → docling-2.42.1}/docling/backend/xml/uspto_backend.py +0 -0
  39. {docling-2.41.0 → docling-2.42.1}/docling/chunking/__init__.py +0 -0
  40. {docling-2.41.0 → docling-2.42.1}/docling/cli/__init__.py +0 -0
  41. {docling-2.41.0 → docling-2.42.1}/docling/cli/main.py +0 -0
  42. {docling-2.41.0 → docling-2.42.1}/docling/cli/models.py +0 -0
  43. {docling-2.41.0 → docling-2.42.1}/docling/cli/tools.py +0 -0
  44. {docling-2.41.0 → docling-2.42.1}/docling/datamodel/__init__.py +0 -0
  45. {docling-2.41.0 → docling-2.42.1}/docling/datamodel/accelerator_options.py +0 -0
  46. {docling-2.41.0 → docling-2.42.1}/docling/datamodel/asr_model_specs.py +0 -0
  47. {docling-2.41.0 → docling-2.42.1}/docling/datamodel/base_models.py +0 -0
  48. {docling-2.41.0 → docling-2.42.1}/docling/datamodel/document.py +0 -0
  49. {docling-2.41.0 → docling-2.42.1}/docling/datamodel/layout_model_specs.py +0 -0
  50. {docling-2.41.0 → docling-2.42.1}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  51. {docling-2.41.0 → docling-2.42.1}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
  52. {docling-2.41.0 → docling-2.42.1}/docling/datamodel/settings.py +0 -0
  53. {docling-2.41.0 → docling-2.42.1}/docling/datamodel/vlm_model_specs.py +0 -0
  54. {docling-2.41.0 → docling-2.42.1}/docling/exceptions.py +0 -0
  55. {docling-2.41.0 → docling-2.42.1}/docling/models/__init__.py +0 -0
  56. {docling-2.41.0 → docling-2.42.1}/docling/models/api_vlm_model.py +0 -0
  57. {docling-2.41.0 → docling-2.42.1}/docling/models/base_model.py +0 -0
  58. {docling-2.41.0 → docling-2.42.1}/docling/models/base_ocr_model.py +0 -0
  59. {docling-2.41.0 → docling-2.42.1}/docling/models/code_formula_model.py +0 -0
  60. {docling-2.41.0 → docling-2.42.1}/docling/models/document_picture_classifier.py +0 -0
  61. {docling-2.41.0 → docling-2.42.1}/docling/models/easyocr_model.py +0 -0
  62. {docling-2.41.0 → docling-2.42.1}/docling/models/factories/__init__.py +0 -0
  63. {docling-2.41.0 → docling-2.42.1}/docling/models/factories/base_factory.py +0 -0
  64. {docling-2.41.0 → docling-2.42.1}/docling/models/factories/ocr_factory.py +0 -0
  65. {docling-2.41.0 → docling-2.42.1}/docling/models/factories/picture_description_factory.py +0 -0
  66. {docling-2.41.0 → docling-2.42.1}/docling/models/layout_model.py +0 -0
  67. {docling-2.41.0 → docling-2.42.1}/docling/models/ocr_mac_model.py +0 -0
  68. {docling-2.41.0 → docling-2.42.1}/docling/models/page_assemble_model.py +0 -0
  69. {docling-2.41.0 → docling-2.42.1}/docling/models/page_preprocessing_model.py +0 -0
  70. {docling-2.41.0 → docling-2.42.1}/docling/models/picture_description_api_model.py +0 -0
  71. {docling-2.41.0 → docling-2.42.1}/docling/models/picture_description_base_model.py +0 -0
  72. {docling-2.41.0 → docling-2.42.1}/docling/models/plugins/__init__.py +0 -0
  73. {docling-2.41.0 → docling-2.42.1}/docling/models/plugins/defaults.py +0 -0
  74. {docling-2.41.0 → docling-2.42.1}/docling/models/rapid_ocr_model.py +0 -0
  75. {docling-2.41.0 → docling-2.42.1}/docling/models/readingorder_model.py +0 -0
  76. {docling-2.41.0 → docling-2.42.1}/docling/models/table_structure_model.py +0 -0
  77. {docling-2.41.0 → docling-2.42.1}/docling/models/tesseract_ocr_cli_model.py +0 -0
  78. {docling-2.41.0 → docling-2.42.1}/docling/models/tesseract_ocr_model.py +0 -0
  79. {docling-2.41.0 → docling-2.42.1}/docling/models/utils/__init__.py +0 -0
  80. {docling-2.41.0 → docling-2.42.1}/docling/models/utils/hf_model_download.py +0 -0
  81. {docling-2.41.0 → docling-2.42.1}/docling/models/vlm_models_inline/__init__.py +0 -0
  82. {docling-2.41.0 → docling-2.42.1}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
  83. {docling-2.41.0 → docling-2.42.1}/docling/models/vlm_models_inline/mlx_model.py +0 -0
  84. {docling-2.41.0 → docling-2.42.1}/docling/pipeline/__init__.py +0 -0
  85. {docling-2.41.0 → docling-2.42.1}/docling/pipeline/asr_pipeline.py +0 -0
  86. {docling-2.41.0 → docling-2.42.1}/docling/pipeline/base_pipeline.py +0 -0
  87. {docling-2.41.0 → docling-2.42.1}/docling/pipeline/simple_pipeline.py +0 -0
  88. {docling-2.41.0 → docling-2.42.1}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  89. {docling-2.41.0 → docling-2.42.1}/docling/pipeline/vlm_pipeline.py +0 -0
  90. {docling-2.41.0 → docling-2.42.1}/docling/py.typed +0 -0
  91. {docling-2.41.0 → docling-2.42.1}/docling/utils/__init__.py +0 -0
  92. {docling-2.41.0 → docling-2.42.1}/docling/utils/accelerator_utils.py +0 -0
  93. {docling-2.41.0 → docling-2.42.1}/docling/utils/api_image_request.py +0 -0
  94. {docling-2.41.0 → docling-2.42.1}/docling/utils/export.py +0 -0
  95. {docling-2.41.0 → docling-2.42.1}/docling/utils/glm_utils.py +0 -0
  96. {docling-2.41.0 → docling-2.42.1}/docling/utils/locks.py +0 -0
  97. {docling-2.41.0 → docling-2.42.1}/docling/utils/model_downloader.py +0 -0
  98. {docling-2.41.0 → docling-2.42.1}/docling/utils/ocr_utils.py +0 -0
  99. {docling-2.41.0 → docling-2.42.1}/docling/utils/orientation.py +0 -0
  100. {docling-2.41.0 → docling-2.42.1}/docling/utils/profiling.py +0 -0
  101. {docling-2.41.0 → docling-2.42.1}/docling/utils/utils.py +0 -0
  102. {docling-2.41.0 → docling-2.42.1}/docling/utils/visualization.py +0 -0
  103. {docling-2.41.0 → docling-2.42.1}/docling.egg-info/SOURCES.txt +0 -0
  104. {docling-2.41.0 → docling-2.42.1}/docling.egg-info/dependency_links.txt +0 -0
  105. {docling-2.41.0 → docling-2.42.1}/docling.egg-info/entry_points.txt +0 -0
  106. {docling-2.41.0 → docling-2.42.1}/docling.egg-info/top_level.txt +0 -0
  107. {docling-2.41.0 → docling-2.42.1}/setup.cfg +0 -0
  108. {docling-2.41.0 → docling-2.42.1}/tests/test_asr_pipeline.py +0 -0
  109. {docling-2.41.0 → docling-2.42.1}/tests/test_backend_asciidoc.py +0 -0
  110. {docling-2.41.0 → docling-2.42.1}/tests/test_backend_csv.py +0 -0
  111. {docling-2.41.0 → docling-2.42.1}/tests/test_backend_docling_json.py +0 -0
  112. {docling-2.41.0 → docling-2.42.1}/tests/test_backend_docling_parse.py +0 -0
  113. {docling-2.41.0 → docling-2.42.1}/tests/test_backend_docling_parse_v2.py +0 -0
  114. {docling-2.41.0 → docling-2.42.1}/tests/test_backend_docling_parse_v4.py +0 -0
  115. {docling-2.41.0 → docling-2.42.1}/tests/test_backend_markdown.py +0 -0
  116. {docling-2.41.0 → docling-2.42.1}/tests/test_backend_msexcel.py +0 -0
  117. {docling-2.41.0 → docling-2.42.1}/tests/test_backend_msword.py +0 -0
  118. {docling-2.41.0 → docling-2.42.1}/tests/test_backend_patent_uspto.py +0 -0
  119. {docling-2.41.0 → docling-2.42.1}/tests/test_backend_pdfium.py +0 -0
  120. {docling-2.41.0 → docling-2.42.1}/tests/test_backend_pptx.py +0 -0
  121. {docling-2.41.0 → docling-2.42.1}/tests/test_backend_webp.py +0 -0
  122. {docling-2.41.0 → docling-2.42.1}/tests/test_cli.py +0 -0
  123. {docling-2.41.0 → docling-2.42.1}/tests/test_code_formula.py +0 -0
  124. {docling-2.41.0 → docling-2.42.1}/tests/test_data_gen_flag.py +0 -0
  125. {docling-2.41.0 → docling-2.42.1}/tests/test_document_picture_classifier.py +0 -0
  126. {docling-2.41.0 → docling-2.42.1}/tests/test_e2e_conversion.py +0 -0
  127. {docling-2.41.0 → docling-2.42.1}/tests/test_e2e_ocr_conversion.py +0 -0
  128. {docling-2.41.0 → docling-2.42.1}/tests/test_input_doc.py +0 -0
  129. {docling-2.41.0 → docling-2.42.1}/tests/test_interfaces.py +0 -0
  130. {docling-2.41.0 → docling-2.42.1}/tests/test_invalid_input.py +0 -0
  131. {docling-2.41.0 → docling-2.42.1}/tests/test_legacy_format_transform.py +0 -0
  132. {docling-2.41.0 → docling-2.42.1}/tests/test_ocr_utils.py +0 -0
  133. {docling-2.41.0 → docling-2.42.1}/tests/test_options.py +0 -0
  134. {docling-2.41.0 → docling-2.42.1}/tests/test_settings_load.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.41.0
3
+ Version: 2.42.1
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -50,6 +50,7 @@ Requires-Dist: tqdm<5.0.0,>=4.65.0
50
50
  Requires-Dist: pluggy<2.0.0,>=1.0.0
51
51
  Requires-Dist: pylatexenc<3.0,>=2.10
52
52
  Requires-Dist: scipy<2.0.0,>=1.6.0
53
+ Requires-Dist: accelerate<2,>=1.0.0
53
54
  Provides-Extra: tesserocr
54
55
  Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
55
56
  Provides-Extra: ocrmac
@@ -260,7 +260,15 @@ class oMath2Latex(Tag2Method):
260
260
  the fraction object
261
261
  """
262
262
  c_dict = self.process_children_dict(elm)
263
- pr = c_dict["fPr"]
263
+ pr = c_dict.get("fPr")
264
+ if pr is None:
265
+ # Handle missing fPr element gracefully
266
+ _log.debug("Missing fPr element in fraction, using default formatting")
267
+ latex_s = F_DEFAULT
268
+ return latex_s.format(
269
+ num=c_dict.get("num"),
270
+ den=c_dict.get("den"),
271
+ )
264
272
  latex_s = get_val(pr.type, default=F_DEFAULT, store=F)
265
273
  return pr.text + latex_s.format(num=c_dict.get("num"), den=c_dict.get("den"))
266
274
 
@@ -0,0 +1,542 @@
1
+ import logging
2
+ import re
3
+ import traceback
4
+ from io import BytesIO
5
+ from pathlib import Path
6
+ from typing import Final, Optional, Union, cast
7
+
8
+ from bs4 import BeautifulSoup, NavigableString, Tag
9
+ from bs4.element import PreformattedString
10
+ from docling_core.types.doc import (
11
+ DocItem,
12
+ DocItemLabel,
13
+ DoclingDocument,
14
+ DocumentOrigin,
15
+ GroupItem,
16
+ GroupLabel,
17
+ TableCell,
18
+ TableData,
19
+ TextItem,
20
+ )
21
+ from docling_core.types.doc.document import ContentLayer
22
+ from pydantic import BaseModel
23
+ from typing_extensions import override
24
+
25
+ from docling.backend.abstract_backend import DeclarativeDocumentBackend
26
+ from docling.datamodel.base_models import InputFormat
27
+ from docling.datamodel.document import InputDocument
28
+
29
_log = logging.getLogger(__name__)

# Default image dimensions.
# NOTE(review): neither constant is referenced by the code visible in this
# module; presumably they are consumed elsewhere - confirm before removing.
DEFAULT_IMAGE_HEIGHT = 128
DEFAULT_IMAGE_WIDTH = 128

# Names of the HTML tags that start a distinct Docling item. The walker flushes
# its buffered inline text whenever it meets one of these tags.
_BLOCK_TAGS: Final = {
    # headings h1..h6
    *(f"h{n}" for n in range(1, 7)),
    # text-like blocks
    "p",
    "address",
    "summary",
    "details",
    # code-like blocks
    "pre",
    "code",
    # lists
    "ol",
    "ul",
    # tables and figures
    "table",
    "figure",
}
53
+
54
+
55
class _Context(BaseModel):
    """Bookkeeping about list groups, keyed by the group's ``self_ref``."""

    # Whether the list group was created from an <ol> (True) or not (False).
    list_ordered_flag_by_ref: dict[str, bool] = dict()
    # First number of an ordered list; only recorded when the <ol> carried a
    # `start` attribute that could be parsed.
    list_start_by_ref: dict[str, int] = dict()
58
+
59
+
60
class HTMLDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend that converts an HTML document to a DoclingDocument.

    The markup is parsed with BeautifulSoup's ``html.parser`` at construction
    time; ``convert`` then walks the parsed tree and emits titles, headings,
    text, lists, tables, code snippets and pictures.
    """

    @override
    def __init__(
        self,
        in_doc: InputDocument,
        path_or_stream: Union[BytesIO, Path],
    ):
        """Read and parse the HTML input.

        Args:
            in_doc: The input document descriptor, forwarded to the base class.
            path_or_stream: In-memory bytes or a filesystem path of the HTML.

        Raises:
            RuntimeError: If the input cannot be read or parsed.
        """
        super().__init__(in_doc, path_or_stream)
        self.soup: Optional[Tag] = None
        self.path_or_stream = path_or_stream

        # Initialize the parents for the hierarchy: parents[n] is the item that
        # new content at nesting level n is attached to.
        self.max_levels = 10
        self.level = 0
        self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
        self.ctx = _Context()
        for i in range(self.max_levels):
            self.parents[i] = None

        try:
            raw = (
                path_or_stream.getvalue()
                if isinstance(path_or_stream, BytesIO)
                else Path(path_or_stream).read_bytes()
            )
            self.soup = BeautifulSoup(raw, "html.parser")
        except Exception as e:
            raise RuntimeError(
                "Could not initialize HTML backend for file with "
                f"hash {self.document_hash}."
            ) from e

    @override
    def is_valid(self) -> bool:
        """Return True when the HTML was parsed successfully."""
        return self.soup is not None

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        """Return False: this backend does not paginate."""
        return False

    @override
    def unload(self):
        """Close the input stream (if it is a BytesIO) and drop the reference."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()
        self.path_or_stream = None

    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        """Return the input formats handled by this backend (HTML only)."""
        return {InputFormat.HTML}

    @override
    def convert(self) -> DoclingDocument:
        """Convert the parsed HTML into a DoclingDocument.

        The ``<title>`` is added as furniture, ``<script>``/``<style>`` tags
        are removed and ``<br>`` tags are replaced by newlines before the tree
        is walked. Content found before the first heading goes to the furniture
        layer when the document has at least one heading, else to the body.

        Returns:
            The converted document. If walking the tree fails, the error is
            logged and the partially built document is returned.

        Raises:
            RuntimeError: If the backend holds no parsed document.
        """
        _log.debug("Starting HTML conversion...")
        if not self.is_valid():
            raise RuntimeError("Invalid HTML document.")

        origin = DocumentOrigin(
            filename=self.file.name or "file",
            mimetype="text/html",
            binary_hash=self.document_hash,
        )
        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)

        assert self.soup is not None
        # set the title as furniture, since it is part of the document metadata
        title = self.soup.title
        if title:
            doc.add_title(
                text=title.get_text(separator=" ", strip=True),
                content_layer=ContentLayer.FURNITURE,
            )
        # remove scripts/styles
        for tag in self.soup(["script", "style"]):
            tag.decompose()
        content = self.soup.body or self.soup
        # normalize <br> tags
        for br in content("br"):
            br.replace_with(NavigableString("\n"))
        # set default content layer
        headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
        self.content_layer = (
            ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
        )
        # reset context
        self.ctx = _Context()

        try:
            self._walk(content, doc)
        except Exception:
            # Best effort: keep what was converted so far, but report the
            # failure through the module logger instead of printing to stdout.
            _log.exception("Error while walking the HTML document.")

        return doc

    def _walk(self, element: Tag, doc: DoclingDocument) -> None:
        """Parse an HTML tag by recursively walking its content.

        While walking, the method buffers inline text across tags like <b> or
        <span>, emitting text nodes only at block boundaries.

        Args:
            element: The tag whose children are parsed.
            doc: The Docling document to be updated with the parsed content.
        """
        buffer: list[str] = []

        def flush_buffer():
            # Emit the buffered inline text, one text item per non-empty line.
            if not buffer:
                return
            text = "".join(buffer).strip()
            buffer.clear()
            if not text:
                return
            for part in text.split("\n"):
                seg = part.strip()
                if seg:
                    doc.add_text(
                        DocItemLabel.TEXT,
                        seg,
                        parent=self.parents[self.level],
                        content_layer=self.content_layer,
                    )

        for node in element.contents:
            if isinstance(node, Tag):
                name = node.name.lower()
                if name == "img":
                    flush_buffer()
                    self._emit_image(node, doc)
                elif name in _BLOCK_TAGS:
                    flush_buffer()
                    self._handle_block(node, doc)
                elif node.find(_BLOCK_TAGS):
                    # a wrapper (e.g. <div>) that contains block tags
                    flush_buffer()
                    self._walk(node, doc)
                else:
                    # purely inline content: keep buffering
                    buffer.append(node.text)
            elif isinstance(node, NavigableString) and not isinstance(
                node, PreformattedString
            ):
                buffer.append(str(node))

        flush_buffer()

    def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
        """Add an <h1>..<h6> tag as title or heading and update the hierarchy.

        An <h1> resets the hierarchy and becomes a title item. Other headings
        are lowered by one level; missing intermediate levels are filled with
        section groups, and deeper parents are cleared when moving back up.
        Images inside the heading are emitted afterwards.

        Args:
            tag: The heading tag.
            doc: The Docling document to be updated.
        """
        tag_name = tag.name.lower()
        # set default content layer to BODY as soon as we encounter a heading
        self.content_layer = ContentLayer.BODY
        level = int(tag_name[1])
        text = tag.get_text(strip=True, separator=" ")
        # the first level is for the title item
        if level == 1:
            for key in self.parents:
                self.parents[key] = None
            self.level = 0
            self.parents[self.level + 1] = doc.add_title(
                text, content_layer=self.content_layer
            )
        # the other levels need to be lowered by 1 if a title was set
        else:
            level -= 1
            if level > self.level:
                # add invisible group
                for i in range(self.level, level):
                    _log.debug(f"Adding invisible group to level {i}")
                    self.parents[i + 1] = doc.add_group(
                        name=f"header-{i + 1}",
                        label=GroupLabel.SECTION,
                        parent=self.parents[i],
                        content_layer=self.content_layer,
                    )
                self.level = level
            elif level < self.level:
                # remove the tail
                for key in self.parents:
                    if key > level + 1:
                        _log.debug(f"Remove the tail of level {key}")
                        self.parents[key] = None
                self.level = level
            self.parents[self.level + 1] = doc.add_heading(
                parent=self.parents[self.level],
                text=text,
                level=self.level,
                content_layer=self.content_layer,
            )
            self.level += 1
        for img_tag in tag("img"):
            if isinstance(img_tag, Tag):
                self._emit_image(img_tag, doc)

    def _handle_list(self, tag: Tag, doc: DoclingDocument) -> None:
        """Add a <ul>/<ol> tag as a list group with its items and sub-lists.

        Args:
            tag: The list tag.
            doc: The Docling document to be updated.
        """
        tag_name = tag.name.lower()
        start: Optional[int] = None
        name: str = ""
        is_ordered = tag_name == "ol"
        if is_ordered:
            start_attr = tag.get("start")
            # isdecimal() only accepts strings that int() can parse, whereas
            # isnumeric() also accepts e.g. fractions and superscripts.
            if isinstance(start_attr, str) and start_attr.isdecimal():
                start = int(start_attr)
            name = "ordered list" + (f" start {start}" if start is not None else "")
        else:
            name = "list"
        # Create the list container
        list_group = doc.add_list_group(
            name=name,
            parent=self.parents[self.level],
            content_layer=self.content_layer,
        )
        self.parents[self.level + 1] = list_group
        self.ctx.list_ordered_flag_by_ref[list_group.self_ref] = is_ordered
        if is_ordered and start is not None:
            self.ctx.list_start_by_ref[list_group.self_ref] = start
        self.level += 1

        # For each top-level <li> in this list
        for li in tag.find_all({"li", "ul", "ol"}, recursive=False):
            if not isinstance(li, Tag):
                continue

            # sub-list items should be indented under main list items, but temporarily
            # addressing invalid HTML (docling-core/issues/357)
            if li.name in {"ul", "ol"}:
                self._handle_block(li, doc)

            else:
                # 1) determine the marker
                if is_ordered and start is not None:
                    marker = f"{start + len(list_group.children)}."
                else:
                    marker = ""

                # 2) extract only the "direct" text from this <li>
                parts: list[str] = []
                for child in li.contents:
                    if isinstance(child, NavigableString) and not isinstance(
                        child, PreformattedString
                    ):
                        parts.append(child)
                    elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
                        text_part = child.get_text()
                        if text_part:
                            parts.append(text_part)
                li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()

                # 3) add the list item
                if li_text:
                    self.parents[self.level + 1] = doc.add_list_item(
                        text=li_text,
                        enumerated=is_ordered,
                        marker=marker,
                        parent=list_group,
                        content_layer=self.content_layer,
                    )

                    # 4) recurse into any nested lists, attaching them to this <li> item
                    for sublist in li({"ul", "ol"}, recursive=False):
                        if isinstance(sublist, Tag):
                            self.level += 1
                            self._handle_block(sublist, doc)
                            self.parents[self.level + 1] = None
                            self.level -= 1
                else:
                    # no text of its own: attach nested lists to the list group
                    for sublist in li({"ul", "ol"}, recursive=False):
                        if isinstance(sublist, Tag):
                            self._handle_block(sublist, doc)

                # 5) extract any images under this <li>
                for img_tag in li("img"):
                    if isinstance(img_tag, Tag):
                        self._emit_image(img_tag, doc)

        self.parents[self.level + 1] = None
        self.level -= 1

    def _handle_block(self, tag: Tag, doc: DoclingDocument) -> None:
        """Dispatch a block-level tag to the matching Docling item(s).

        Args:
            tag: A tag whose name is one of the block tags.
            doc: The Docling document to be updated.
        """
        tag_name = tag.name.lower()

        if tag_name == "figure":
            # only the first image of the figure is emitted
            img_tag = tag.find("img")
            if isinstance(img_tag, Tag):
                self._emit_image(img_tag, doc)

        elif tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
            self._handle_heading(tag, doc)

        elif tag_name in {"ul", "ol"}:
            self._handle_list(tag, doc)

        elif tag_name in {"p", "address", "summary"}:
            for part in tag.text.split("\n"):
                seg = part.strip()
                if seg:
                    doc.add_text(
                        parent=self.parents[self.level],
                        label=DocItemLabel.TEXT,
                        text=seg,
                        content_layer=self.content_layer,
                    )
            for img_tag in tag("img"):
                if isinstance(img_tag, Tag):
                    self._emit_image(img_tag, doc)

        elif tag_name == "table":
            data = HTMLDocumentBackend.parse_table_data(tag)
            for img_tag in tag("img"):
                if isinstance(img_tag, Tag):
                    # pass the image itself (not the table) so that its `alt`
                    # text and enclosing <figure> are looked up correctly
                    self._emit_image(img_tag, doc)
            if data is not None:
                doc.add_table(
                    data=data,
                    parent=self.parents[self.level],
                    content_layer=self.content_layer,
                )

        elif tag_name in {"pre", "code"}:
            # handle monospace code snippets (pre).
            text = tag.get_text(strip=True)
            if text:
                doc.add_code(
                    parent=self.parents[self.level],
                    text=text,
                    content_layer=self.content_layer,
                )

        elif tag_name == "details":
            # handle details and its content.
            self.parents[self.level + 1] = doc.add_group(
                name="details",
                label=GroupLabel.SECTION,
                parent=self.parents[self.level],
                content_layer=self.content_layer,
            )
            self.level += 1
            self._walk(tag, doc)
            self.parents[self.level + 1] = None
            self.level -= 1

    def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
        """Add a picture item, with a caption when one can be found.

        The caption is the text of a ``<figcaption>`` that is a direct child of
        the enclosing ``<figure>``, falling back to the image's ``alt`` text.

        Args:
            img_tag: The image tag.
            doc: The Docling document to be updated.
        """
        figure = img_tag.find_parent("figure")
        caption: str = ""
        if isinstance(figure, Tag):
            caption_tag = figure.find("figcaption", recursive=False)
            if isinstance(caption_tag, Tag):
                caption = caption_tag.get_text()
        if not caption:
            caption = str(img_tag.get("alt", "")).strip()

        caption_item: Optional[TextItem] = None
        if caption:
            caption_item = doc.add_text(
                DocItemLabel.CAPTION, text=caption, content_layer=self.content_layer
            )

        doc.add_picture(
            caption=caption_item,
            parent=self.parents[self.level],
            content_layer=self.content_layer,
        )

    @staticmethod
    def _get_cell_spans(cell: Tag) -> tuple[int, int]:
        """Extract colspan and rowspan values from a table cell tag.

        Each attribute is parsed independently. If an attribute does not exist
        or is not a decimal number, it defaults to 1.

        Args:
            cell: The table cell tag.

        Returns:
            A ``(col_span, row_span)`` tuple.
        """
        raw_col_span = str(cell.get("colspan", "1"))
        raw_row_span = str(cell.get("rowspan", "1"))
        # isdecimal() guarantees that int() succeeds; each value is validated
        # on its own so a malformed colspan cannot affect the rowspan.
        col_span = int(raw_col_span) if raw_col_span.isdecimal() else 1
        row_span = int(raw_row_span) if raw_row_span.isdecimal() else 1

        return col_span, row_span

    @staticmethod
    def parse_table_data(element: Tag) -> Optional[TableData]:  # noqa: C901
        """Convert a <table> tag into table data.

        Args:
            element: The table tag.

        Returns:
            The table data, or None when the table contains a nested table.
        """
        nested_tables = element.find("table")
        if nested_tables is not None:
            _log.debug("Skipping nested table.")
            return None

        # Find the number of rows and columns (taking into account spans)
        num_rows = 0
        num_cols = 0
        for row in element("tr"):
            col_count = 0
            is_row_header = True
            if not isinstance(row, Tag):
                continue
            for cell in row(["td", "th"]):
                if not isinstance(cell, Tag):
                    continue
                col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell)
                col_count += col_span
                if cell.name == "td" or row_span == 1:
                    is_row_header = False
            num_cols = max(num_cols, col_count)
            # rows made only of <th> cells spanning several rows are not counted
            if not is_row_header:
                num_rows += 1

        _log.debug(f"The table has {num_rows} rows and {num_cols} cols.")

        # occupancy grid used to find the next free column in a row
        grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]

        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

        # Iterate over the rows in the table
        start_row_span = 0
        row_idx = -1
        for row in element("tr"):
            if not isinstance(row, Tag):
                continue

            # For each row, find all the column cells (both <td> and <th>)
            cells = row(["td", "th"])

            # Check if cell is in a column header or row header
            col_header = True
            row_header = True
            for html_cell in cells:
                if isinstance(html_cell, Tag):
                    _, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
                    if html_cell.name == "td":
                        col_header = False
                        row_header = False
                    elif row_span == 1:
                        row_header = False
            if not row_header:
                row_idx += 1
                start_row_span = 0
            else:
                start_row_span += 1

            # Extract the text content of each cell
            col_idx = 0
            for html_cell in cells:
                if not isinstance(html_cell, Tag):
                    continue

                # extract inline formulas
                for formula in html_cell("inline-formula"):
                    math_parts = formula.text.split("$$")
                    if len(math_parts) == 3:
                        math_formula = f"$${math_parts[1]}$$"
                        formula.replace_with(NavigableString(math_formula))

                # TODO: extract content correctly from table-cells with lists
                text = html_cell.text

                # label = html_cell.name
                col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
                if row_header:
                    row_span -= 1
                # skip the columns already taken by cells spanning from above
                while (
                    col_idx < num_cols
                    and grid[row_idx + start_row_span][col_idx] is not None
                ):
                    col_idx += 1
                for r in range(start_row_span, start_row_span + row_span):
                    for c in range(col_span):
                        if row_idx + r < num_rows and col_idx + c < num_cols:
                            grid[row_idx + r][col_idx + c] = text

                table_cell = TableCell(
                    text=text,
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=start_row_span + row_idx,
                    end_row_offset_idx=start_row_span + row_idx + row_span,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + col_span,
                    column_header=col_header,
                    row_header=((not col_header) and html_cell.name == "th"),
                )
                data.table_cells.append(table_cell)

        return data
@@ -93,8 +93,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
93
93
 
94
94
  # Initialize the root of the document hierarchy
95
95
  self.root: Optional[NodeItem] = None
96
-
97
- self.valid = False
96
+ self.hlevel: int = 0
97
+ self.valid: bool = False
98
98
  try:
99
99
  if isinstance(self.path_or_stream, BytesIO):
100
100
  self.path_or_stream.seek(0)
@@ -147,6 +147,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
147
147
  binary_hash=self.document_hash,
148
148
  )
149
149
  doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
150
+ self.hlevel = 0
150
151
 
151
152
  # Get metadata XML components
152
153
  xml_components: XMLComponents = self._parse_metadata()
@@ -304,7 +305,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
304
305
  title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
305
306
  if not text:
306
307
  continue
307
- parent = doc.add_heading(parent=self.root, text=title)
308
+ parent = doc.add_heading(
309
+ parent=self.root, text=title, level=self.hlevel + 1
310
+ )
308
311
  doc.add_text(
309
312
  parent=parent,
310
313
  text=text,
@@ -637,7 +640,10 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
637
640
  elif child.tag == "ack":
638
641
  text = DEFAULT_HEADER_ACKNOWLEDGMENTS
639
642
  if text:
640
- new_parent = doc.add_heading(text=text, parent=parent)
643
+ self.hlevel += 1
644
+ new_parent = doc.add_heading(
645
+ text=text, parent=parent, level=self.hlevel
646
+ )
641
647
  elif child.tag == "list":
642
648
  new_parent = doc.add_group(
643
649
  label=GroupLabel.LIST, name="list", parent=parent
@@ -694,6 +700,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
694
700
  new_text = self._walk_linear(doc, new_parent, child)
695
701
  if not (node.getparent().tag == "p" and node.tag in flush_tags):
696
702
  node_text += new_text
703
+ if child.tag in ("sec", "ack") and text:
704
+ self.hlevel -= 1
697
705
 
698
706
  # pick up the tail text
699
707
  node_text += child.tail.replace("\n", " ") if child.tail else ""
@@ -217,7 +217,7 @@ smolvlm_picture_description = PictureDescriptionVlmOptions(
217
217
 
218
218
  # GraniteVision
219
219
  granite_picture_description = PictureDescriptionVlmOptions(
220
- repo_id="ibm-granite/granite-vision-3.2-2b-preview",
220
+ repo_id="ibm-granite/granite-vision-3.3-2b",
221
221
  prompt="What is shown in this image?",
222
222
  )
223
223
 
@@ -279,6 +279,9 @@ class LayoutOptions(BaseModel):
279
279
  """Options for layout processing."""
280
280
 
281
281
  create_orphan_clusters: bool = True # Whether to create clusters for orphaned cells
282
+ keep_empty_clusters: bool = (
283
+ False # Whether to keep clusters that contain no text cells
284
+ )
282
285
  model_spec: LayoutModelConfig = DOCLING_LAYOUT_V2
283
286
 
284
287
 
@@ -1,6 +1,7 @@
1
1
  import hashlib
2
2
  import logging
3
3
  import sys
4
+ import threading
4
5
  import time
5
6
  from collections.abc import Iterable, Iterator
6
7
  from functools import partial
@@ -49,6 +50,7 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
49
50
  from docling.utils.utils import chunkify
50
51
 
51
52
  _log = logging.getLogger(__name__)
53
+ _PIPELINE_CACHE_LOCK = threading.Lock()
52
54
 
53
55
 
54
56
  class FormatOption(BaseModel):
@@ -315,17 +317,18 @@ class DocumentConverter:
315
317
  # Use a composite key to cache pipelines
316
318
  cache_key = (pipeline_class, options_hash)
317
319
 
318
- if cache_key not in self.initialized_pipelines:
319
- _log.info(
320
- f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
321
- )
322
- self.initialized_pipelines[cache_key] = pipeline_class(
323
- pipeline_options=pipeline_options
324
- )
325
- else:
326
- _log.debug(
327
- f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
328
- )
320
+ with _PIPELINE_CACHE_LOCK:
321
+ if cache_key not in self.initialized_pipelines:
322
+ _log.info(
323
+ f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
324
+ )
325
+ self.initialized_pipelines[cache_key] = pipeline_class(
326
+ pipeline_options=pipeline_options
327
+ )
328
+ else:
329
+ _log.debug(
330
+ f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
331
+ )
329
332
 
330
333
  return self.initialized_pipelines[cache_key]
331
334