docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138)
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1502 @@
1
+ import base64
2
+ import logging
3
+ import os
4
+ import re
5
+ import warnings
6
+ from contextlib import contextmanager
7
+ from copy import deepcopy
8
+ from io import BytesIO
9
+ from pathlib import Path
10
+ from typing import Final, Optional, Union, cast
11
+ from urllib.parse import urljoin, urlparse
12
+
13
+ import requests
14
+ from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
15
+ from bs4.element import PreformattedString
16
+ from docling_core.types.doc import (
17
+ DocItem,
18
+ DocItemLabel,
19
+ DoclingDocument,
20
+ DocumentOrigin,
21
+ GroupItem,
22
+ GroupLabel,
23
+ PictureItem,
24
+ RefItem,
25
+ RichTableCell,
26
+ TableCell,
27
+ TableData,
28
+ TableItem,
29
+ TextItem,
30
+ )
31
+ from docling_core.types.doc.document import ContentLayer, Formatting, ImageRef, Script
32
+ from PIL import Image, UnidentifiedImageError
33
+ from pydantic import AnyUrl, BaseModel, ValidationError
34
+ from typing_extensions import override
35
+
36
+ from docling.backend.abstract_backend import (
37
+ DeclarativeDocumentBackend,
38
+ )
39
+ from docling.datamodel.backend_options import HTMLBackendOptions
40
+ from docling.datamodel.base_models import InputFormat
41
+ from docling.datamodel.document import InputDocument
42
+ from docling.exceptions import OperationNotAllowed
43
+
44
# Module-level logger, named after this module.
_log = logging.getLogger(__name__)

# Default image dimensions.
# NOTE(review): presumably a fallback size for images that cannot be loaded;
# the consuming code is outside this view - confirm.
DEFAULT_IMAGE_WIDTH = 128
DEFAULT_IMAGE_HEIGHT = 128

# Tags that initiate distinct Docling items
_BLOCK_TAGS: Final = {
    "address",
    "details",
    "figure",
    "footer",
    "img",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "ol",
    "p",
    "pre",
    "summary",
    "table",
    "ul",
}

# Block-level elements that should not appear inside <p>
_PARA_BREAKERS: Final = {
    "address",
    "article",
    "aside",
    "blockquote",
    "div",
    "dl",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "header",
    "hr",
    "main",
    "nav",
    "ol",
    "ul",
    "li",
    "p",  # <p> inside <p> also forces closing
    "pre",
    "section",
    "table",
    "thead",
    "tbody",
    "tfoot",
    "tr",
    "td",
}

# Inline tags whose text is flagged as code rather than plain text.
_CODE_TAG_SET: Final = {"code", "kbd", "samp"}

# Maps an inline tag name to the keyword arguments it contributes when a
# Formatting object is built. Code tags are present with an empty mapping:
# they are recognised as format tags but add no Formatting attribute.
_FORMAT_TAG_MAP: Final = {
    "b": {"bold": True},
    "strong": {"bold": True},
    "i": {"italic": True},
    "em": {"italic": True},
    "var": {"italic": True},
    # "mark",
    # "small",
    "s": {"strikethrough": True},
    "del": {"strikethrough": True},
    "u": {"underline": True},
    "ins": {"underline": True},
    "sub": {"script": Script.SUB},
    "sup": {"script": Script.SUPER},
    **{k: {} for k in _CODE_TAG_SET},
}
125
+
126
+
127
class _Context(BaseModel):
    """Bookkeeping for lists, reset at the start of each conversion.

    Both mappings are keyed by a string reference.
    NOTE(review): presumably the reference of a list group in the Docling
    document; the code that fills these mappings is outside this view - confirm.
    """

    # Flag per reference (by its name: whether that list is ordered).
    list_ordered_flag_by_ref: dict[str, bool] = {}
    # Integer per reference (by its name: the start number of that list).
    list_start_by_ref: dict[str, int] = {}
130
+
131
+
132
class AnnotatedText(BaseModel):
    """A fragment of text together with its inline annotations."""

    # The text content of the fragment.
    text: str
    # Link target attached to the fragment, if any.
    hyperlink: Optional[Union[AnyUrl, Path]] = None
    # Formatting attached to the fragment, if any.
    formatting: Optional[Formatting] = None
    # Whether the fragment is flagged as code.
    code: bool = False
137
+
138
+
139
class AnnotatedTextList(list):
    """A list of ``AnnotatedText`` fragments with merge and split helpers."""

    def to_single_text_element(self) -> AnnotatedText:
        """Collapse all fragments into one ``AnnotatedText``.

        The stripped fragment texts are joined with single spaces. The first
        non-None formatting and the first non-None hyperlink are kept; a later,
        different value only triggers a warning. The result is flagged as code
        as soon as one fragment is.
        """
        kept_link = None
        kept_format = None
        is_code = False
        pieces = []

        for fragment in self:
            pieces.append(fragment.text.strip())

            fmt = fragment.formatting
            if fmt is not None:
                if kept_format is None:
                    kept_format = fmt
                elif fmt != kept_format:
                    _log.warning(
                        f"Clashing formatting: '{fmt}' and '{kept_format}'! Chose '{kept_format}'"
                    )

            link = fragment.hyperlink
            if link is not None:
                if kept_link is None:
                    kept_link = link
                elif link != kept_link:
                    _log.warning(
                        f"Clashing hyperlinks: '{link}' and '{kept_link}'! Chose '{kept_link}'"
                    )

            if fragment.code:
                is_code = fragment.code

        return AnnotatedText(
            text=" ".join(pieces).strip(),
            hyperlink=kept_link,
            formatting=kept_format,
            code=is_code,
        )

    def simplify_text_elements(self) -> "AnnotatedTextList":
        """Merge neighbouring fragments that share all annotations.

        Consecutive fragments with equal hyperlink, formatting and code flag
        are concatenated. A single space is inserted between two texts only
        when both are non-blank. An empty list is returned as is (the same
        object). The trailing run is emitted only if its text is non-empty.
        """
        if not self:
            return self

        merged = AnnotatedTextList()
        head, *tail = self
        text = head.text
        hyperlink = head.hyperlink
        formatting = head.formatting
        code = head.code
        previous_text = text

        for fragment in tail:
            same_annotations = (
                hyperlink == fragment.hyperlink
                and formatting == fragment.formatting
                and code == fragment.code
            )
            if same_annotations:
                both_have_text = fragment.text.strip() and previous_text.strip()
                glue = " " if both_have_text else ""
                text += glue + fragment.text
                previous_text = fragment.text
                continue

            # Annotations changed: emit the finished run and start a new one.
            merged.append(
                AnnotatedText(
                    text=text, hyperlink=hyperlink, formatting=formatting, code=code
                )
            )
            text = fragment.text
            previous_text = text
            hyperlink = fragment.hyperlink
            formatting = fragment.formatting
            code = fragment.code

        if text:
            merged.append(
                AnnotatedText(
                    text=text, hyperlink=hyperlink, formatting=formatting, code=code
                )
            )
        return merged

    def split_by_newline(self):
        """Split the fragments into a list of ``AnnotatedTextList`` lines.

        A fragment without a newline is appended to the line being built.
        A fragment containing newlines is copied once per segment, and every
        segment (including the last one) closes the line being built.
        """
        lines = []
        current_line = AnnotatedTextList()

        for fragment in self:
            segments = fragment.text.split("\n")
            if len(segments) == 1:
                current_line.append(fragment)
                continue

            for segment in segments:
                clone = deepcopy(fragment)
                clone.text = segment
                current_line.append(clone)
                lines.append(current_line)
                current_line = AnnotatedTextList()

        if current_line:
            lines.append(current_line)
        return lines
228
+
229
+
230
+ class HTMLDocumentBackend(DeclarativeDocumentBackend):
231
@override
def __init__(
    self,
    in_doc: InputDocument,
    path_or_stream: Union[BytesIO, Path],
    options: HTMLBackendOptions = HTMLBackendOptions(),
):
    """Load the HTML input and parse it with BeautifulSoup.

    Args:
        in_doc: The input document handed to the base backend.
        path_or_stream: Either an in-memory stream or a path to the HTML file.
        options: Backend options; ``source_uri`` is stored (as a string) as the
            base used to resolve relative locations.

    Raises:
        RuntimeError: If the input cannot be read or parsed.
    """
    super().__init__(in_doc, path_or_stream, options)
    self.options: HTMLBackendOptions
    self.soup: Optional[BeautifulSoup] = None
    self.path_or_stream: Union[BytesIO, Path] = path_or_stream
    # NOTE(review): str() is applied unconditionally, so a None source_uri
    # would be stored as the (truthy) string "None" - confirm this is intended.
    self.base_path: Optional[str] = str(options.source_uri)

    # Hierarchy state: one parent slot per nesting level, all empty at start
    self.max_levels = 10
    self.level = 0
    self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {
        depth: None for depth in range(self.max_levels)
    }
    self.ctx = _Context()
    self.hyperlink: Union[AnyUrl, Path, None] = None
    self.format_tags: list[str] = []

    try:
        if isinstance(path_or_stream, BytesIO):
            raw = path_or_stream.getvalue()
        else:
            raw = Path(path_or_stream).read_bytes()
        self.soup = BeautifulSoup(raw, "html.parser")
    except Exception as e:
        raise RuntimeError(
            "Could not initialize HTML backend for file with "
            f"hash {self.document_hash}."
        ) from e
266
+
267
+ @override
268
+ def is_valid(self) -> bool:
269
+ return self.soup is not None
270
+
271
@classmethod
@override
def supports_pagination(cls) -> bool:
    """Report whether this backend supports pagination: always False."""
    return False
275
+
276
+ @override
277
+ def unload(self):
278
+ if isinstance(self.path_or_stream, BytesIO):
279
+ self.path_or_stream.close()
280
+ self.path_or_stream = None
281
+
282
@classmethod
@override
def supported_formats(cls) -> set[InputFormat]:
    """Return the input formats handled by this backend: HTML only."""
    return {InputFormat.HTML}
286
+
287
@override
def convert(self) -> DoclingDocument:
    """Convert the parsed HTML into a ``DoclingDocument``.

    The soup is modified in place: script, noscript, style and hidden tags are
    removed, invalid paragraph nesting is repaired and <br> tags become
    newline strings. The body (or the whole soup if there is no body) is
    then walked.

    Returns:
        The populated Docling document.

    Raises:
        RuntimeError: If the backend holds no parsed HTML.
    """
    _log.debug("Starting HTML conversion...")
    if not self.is_valid():
        raise RuntimeError("Invalid HTML document.")

    origin = DocumentOrigin(
        filename=self.file.name or "file",
        mimetype="text/html",
        binary_hash=self.document_hash,
    )
    doc = DoclingDocument(name=self.file.stem or "file", origin=origin)

    soup = self.soup
    assert soup is not None

    # The <title> is document metadata, so it is stored as furniture
    title_tag = soup.title
    if title_tag and self.options.add_title:
        raw_title = title_tag.get_text(separator=" ", strip=True)
        doc.add_title(
            text=HTMLDocumentBackend._clean_unicode(raw_title),
            orig=raw_title,
            content_layer=ContentLayer.FURNITURE,
        )

    # Drop content that must not be converted
    for unwanted in soup(["script", "noscript", "style"]):
        unwanted.decompose()
    for concealed in soup(hidden=True):
        concealed.decompose()

    # Repair flow content that is not permitted inside <p>
    HTMLDocumentBackend._fix_invalid_paragraph_structure(soup)

    content = soup.body or soup
    for line_break in content("br"):
        line_break.replace_with(NavigableString("\n"))

    # Furniture-before-first-heading rule; headings inside tables do not count
    first_heading = next(
        (
            heading
            for heading in content.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
            if not heading.find_parent("table")
        ),
        None,
    )
    if self.options.infer_furniture and first_heading is not None:
        self.content_layer = ContentLayer.FURNITURE
    else:
        self.content_layer = ContentLayer.BODY

    # Start from a fresh list context for every conversion
    self.ctx = _Context()
    self._walk(content, doc)
    return doc
345
+
346
@staticmethod
def _fix_invalid_paragraph_structure(soup: BeautifulSoup) -> None:
    """Rewrite <p> elements that contain block-level breakers.

    This function emulates browser logic when other block-level elements
    are found inside a <p> element.
    When a <p> is open and a block-level breaker (e.g., h1-h6, div, table)
    appears, automatically close the <p>, emit it, then emit the breaker,
    and if needed open a new <p> for trailing text.

    Args:
        soup: The HTML document. The DOM may be rewritten.
    """

    def _start_para() -> None:
        # Open a fresh <p> in the output sequence unless one is already open.
        nonlocal current_p
        if current_p is None:
            current_p = soup.new_tag("p")
            new_nodes.append(current_p)

    def _flush_para_if_empty() -> None:
        # Close the open paragraph; if it carries no text, drop it from the
        # output sequence altogether.
        nonlocal current_p
        if current_p is not None and not current_p.get_text(strip=True):
            # remove empty paragraph placeholder
            if current_p in new_nodes:
                new_nodes.remove(current_p)
        # Always reset, so that content following a breaker opens a new <p>.
        current_p = None

    # Every <p> that has at least one breaker tag among its descendants.
    paragraphs = soup.select(f"p:has({','.join(tag for tag in _PARA_BREAKERS)})")

    for p in paragraphs:
        parent = p.parent
        if parent is None:
            continue

        # Replacement sequence for this <p> and the paragraph currently open.
        new_nodes = []
        current_p = None

        # Iterate over a copy: nodes are extracted from p while looping.
        for node in list(p.contents):
            if isinstance(node, NavigableString):
                text = str(node)
                node.extract()
                if text.strip():
                    _start_para()
                    if current_p is not None:
                        current_p.append(NavigableString(text))
                # skip whitespace-only text
                continue

            if isinstance(node, Tag):
                node.extract()

                if node.name in _PARA_BREAKERS:
                    # A direct-child breaker ends the open paragraph and is
                    # emitted as a sibling.
                    _flush_para_if_empty()
                    new_nodes.append(node)
                    continue
                else:
                    # Any other tag stays inside the open paragraph.
                    _start_para()
                    if current_p is not None:
                        current_p.append(node)
                    continue

        _flush_para_if_empty()

        # Locate p among its parent's children to insert at the same position.
        siblings = list(parent.children)
        try:
            idx = siblings.index(p)
        except ValueError:
            # p might have been removed
            continue

        p.extract()
        # Inserting in reverse at a fixed index keeps the original order.
        for n in reversed(new_nodes):
            parent.insert(idx, n)
420
+
421
+ @staticmethod
422
+ def _is_remote_url(value: str) -> bool:
423
+ parsed = urlparse(value)
424
+ return parsed.scheme in {"http", "https", "ftp", "s3", "gs"}
425
+
426
def _resolve_relative_path(self, loc: str) -> str:
    """Resolve a location found in the HTML against ``self.base_path``.

    Without a base path the location is returned unchanged. Otherwise:
    protocol-relative locations ("//...") get an "https:" prefix; locations
    starting with http://, https://, data: or file:// are kept; anything else
    is joined to the base URL (remote base) or to the directory of the base
    path (local base).

    Args:
        loc: The location as written in the HTML.

    Returns:
        The resolved location.
    """
    base = self.base_path
    resolved = loc

    if base:
        if loc.startswith("//"):
            # Protocol-relative URL - default to https
            resolved = "https:" + loc
        elif not loc.startswith(("http://", "https://", "data:", "file://")):
            if HTMLDocumentBackend._is_remote_url(base):
                # remote fetch
                resolved = urljoin(base, loc)
            else:
                # local fetch: relative to the location of the HTML file
                resolved = str(Path(base).parent / loc)

    _log.debug(f"Resolved location {loc} to {resolved}")
    return resolved
442
+
443
@staticmethod
def group_cell_elements(
    group_name: str,
    doc: DoclingDocument,
    provs_in_cell: list[RefItem],
    docling_table: TableItem,
) -> RefItem:
    """Move the given items under a new group that is a child of the table.

    Args:
        group_name: Name of the group to create.
        doc: The document holding the items.
        provs_in_cell: References of the items to regroup.
        docling_table: The table that becomes the parent of the new group.

    Returns:
        The reference of the newly created group.
    """
    group = doc.add_group(
        label=GroupLabel.UNSPECIFIED,
        name=group_name,
        parent=docling_table,
    )
    for item_ref in provs_in_cell:
        group.children.append(item_ref)
        item = item_ref.resolve(doc)
        # Detach the item from its previous parent before re-parenting it
        former_parent = item.parent.resolve(doc)
        own_ref = item.get_ref()
        if own_ref in former_parent.children:
            former_parent.children.remove(own_ref)
        item.parent = group.get_ref()
    return group.get_ref()
464
+
465
@staticmethod
def process_rich_table_cells(
    provs_in_cell: list[RefItem],
    group_name: str,
    doc: DoclingDocument,
    docling_table: TableItem,
) -> tuple[bool, Union[RefItem, None]]:
    """Group the items produced for a table cell, if there are any.

    Args:
        provs_in_cell: References of the items created while parsing the cell.
        group_name: Name for the group wrapping those items.
        doc: The document holding the items.
        docling_table: The table owning the cell.

    Returns:
        ``(False, None)`` when no item was produced; otherwise ``(True, ref)``
        where ``ref`` points to the group created around the items.
    """
    if not provs_in_cell:
        return False, None

    # One or more items: wrap them in a group the rich cell can point to
    group_ref = HTMLDocumentBackend.group_cell_elements(
        group_name, doc, provs_in_cell, docling_table
    )
    return True, group_ref
482
+
483
def _is_rich_table_cell(self, table_cell: Tag) -> bool:
    """Determine whether a table cell should be parsed as a Docling RichTableCell.

    A table cell can hold rich content and be parsed with a Docling RichTableCell.
    However, this requires walking through the content elements and creating
    Docling node items. If the cell holds only plain text, the parsing is simpler
    and using a TableCell is preferred.

    Decision rules:
      * no descendant tags: rich only if the cell has more than one text node;
      * descendant tags but no extracted text: rich only if one of the
        descendants is an <img>;
      * exactly one extracted text fragment: rich only if it carries
        formatting, a hyperlink or the code flag;
      * several extracted text fragments: rich.

    Args:
        table_cell: The HTML tag representing a table cell.

    Returns:
        Whether the cell should be parsed as RichTableCell.
    """
    children = table_cell.find_all(recursive=True)  # all descendants of type Tag
    if not children:
        text_nodes = [
            item for item in table_cell.contents if isinstance(item, NavigableString)
        ]
        return len(text_nodes) > 1

    annotations = self._extract_text_and_hyperlink_recursively(
        table_cell, find_parent_annotation=True
    )
    if not annotations:
        # any() actually evaluates the condition; bool() of a generator
        # expression is always True regardless of its content.
        return any(item.name == "img" for item in children)

    if len(annotations) == 1:
        anno: AnnotatedText = annotations[0]
        return bool(anno.formatting) or bool(anno.hyperlink) or anno.code

    return True
518
+
519
def parse_table_data(
    self,
    element: Tag,
    doc: DoclingDocument,
    docling_table: TableItem,
    num_rows: int,
    num_cols: int,
) -> Optional[TableData]:
    """Parse the rows of an HTML table and add its cells to ``docling_table``.

    Direct <thead>/<tbody> children are unwrapped in place, then every direct
    <tr> child is processed. Cells are registered through
    ``doc.add_table_cell``; the returned ``TableData`` itself is created with
    an empty ``table_cells`` list.

    Args:
        element: The <table> tag. Its DOM is modified (unwrapping, inline
            formula replacement).
        doc: The document receiving the cells and any rich cell content.
        docling_table: The table item the cells are added to.
        num_rows: Number of rows of the table grid.
        num_cols: Number of columns of the table grid.

    Returns:
        A ``TableData`` carrying ``num_rows`` and ``num_cols``.
    """
    for t in cast(list[Tag], element.find_all(["thead", "tbody"], recursive=False)):
        t.unwrap()

    _log.debug(f"The table has {num_rows} rows and {num_cols} cols.")
    # Occupancy grid: a non-None entry marks a position already taken by a
    # cell (possibly through a row or column span).
    grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
    data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

    # Iterate over the rows in the table
    start_row_span = 0
    row_idx = -1

    # We don't want this recursive to support nested tables
    for row in element("tr", recursive=False):
        if not isinstance(row, Tag):
            continue
        # For each row, find all the column cells (both <td> and <th>)
        # We don't want this recursive to support nested tables
        cells = row(["td", "th"], recursive=False)
        # Check if cell is in a column header or row header
        col_header = True
        row_header = True
        for html_cell in cells:
            if isinstance(html_cell, Tag):
                _, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
                if html_cell.name == "td":
                    col_header = False
                    row_header = False
                elif row_span == 1:
                    row_header = False
        # A row counts as "row header" only if all its cells are <th> with a
        # row span other than 1; such rows do not advance row_idx but
        # increase the offset start_row_span instead.
        if not row_header:
            row_idx += 1
            start_row_span = 0
        else:
            start_row_span += 1

        # Extract the text content of each cell
        col_idx = 0
        for html_cell in cells:
            if not isinstance(html_cell, Tag):
                continue

            # extract inline formulas
            for formula in html_cell("inline-formula"):
                math_parts = formula.text.split("$$")
                if len(math_parts) == 3:
                    math_formula = f"$${math_parts[1]}$$"
                    formula.replace_with(NavigableString(math_formula))

            provs_in_cell: list[RefItem] = []
            rich_table_cell = self._is_rich_table_cell(html_cell)
            if rich_table_cell:
                # Parse table cell sub-tree for Rich Cells content:
                with self._use_table_cell_context():
                    provs_in_cell = self._walk(html_cell, doc)

                group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
                # The flag is re-evaluated: it becomes False again when the
                # walk produced no item for this cell.
                rich_table_cell, ref_for_rich_cell = (
                    HTMLDocumentBackend.process_rich_table_cells(
                        provs_in_cell, group_name, doc, docling_table
                    )
                )

            # Extracting text
            text = HTMLDocumentBackend._clean_unicode(
                self.get_text(html_cell).strip()
            )
            col_span, row_span = self._get_cell_spans(html_cell)
            if row_header:
                row_span -= 1
            # Skip positions already occupied by cells spanning from earlier
            # rows or columns.
            while (
                col_idx < num_cols
                and grid[row_idx + start_row_span][col_idx] is not None
            ):
                col_idx += 1
            # Mark every grid position covered by this cell, staying inside
            # the grid bounds.
            for r in range(start_row_span, start_row_span + row_span):
                for c in range(col_span):
                    if row_idx + r < num_rows and col_idx + c < num_cols:
                        grid[row_idx + r][col_idx + c] = text

            if rich_table_cell:
                rich_cell = RichTableCell(
                    text=text,
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=start_row_span + row_idx,
                    end_row_offset_idx=start_row_span + row_idx + row_span,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + col_span,
                    column_header=col_header,
                    row_header=((not col_header) and html_cell.name == "th"),
                    ref=ref_for_rich_cell,  # points to an artificial group around children
                )
                doc.add_table_cell(table_item=docling_table, cell=rich_cell)
            else:
                simple_cell = TableCell(
                    text=text,
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=start_row_span + row_idx,
                    end_row_offset_idx=start_row_span + row_idx + row_span,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + col_span,
                    column_header=col_header,
                    row_header=((not col_header) and html_cell.name == "th"),
                )
                doc.add_table_cell(table_item=docling_table, cell=simple_cell)
    return data
634
+
635
def _walk(self, element: Tag, doc: DoclingDocument) -> list[RefItem]:
    """Parse an HTML tag by recursively walking its content.

    While walking, the method buffers inline text across tags like <b> or <span>,
    emitting text nodes only at block boundaries.

    Args:
        element: The HTML tag to parse.
        doc: The Docling document to be updated with the parsed content.

    Returns:
        The references of the items added to ``doc`` during this walk,
        including those reported by nested walks, image emission and block
        handling.
    """
    added_refs: list[RefItem] = []
    # Inline text collected since the last flush.
    buffer: AnnotatedTextList = AnnotatedTextList()

    def _flush_buffer() -> None:
        # Emit the buffered inline text as text/code items and empty the buffer.
        if not buffer:
            return
        annotated_text_list: AnnotatedTextList = buffer.simplify_text_elements()
        parts = annotated_text_list.split_by_newline()
        buffer.clear()

        # Nothing to emit when the merged fragments carry no text at all.
        if not "".join([el.text for el in annotated_text_list]):
            return

        for annotated_text_list in parts:
            with self._use_inline_group(annotated_text_list, doc):
                for annotated_text in annotated_text_list:
                    if annotated_text.text.strip():
                        seg_clean = HTMLDocumentBackend._clean_unicode(
                            annotated_text.text.strip()
                        )
                        if annotated_text.code:
                            docling_code2 = doc.add_code(
                                parent=self.parents[self.level],
                                text=seg_clean,
                                content_layer=self.content_layer,
                                formatting=annotated_text.formatting,
                                hyperlink=annotated_text.hyperlink,
                            )
                            added_refs.append(docling_code2.get_ref())
                        else:
                            docling_text2 = doc.add_text(
                                parent=self.parents[self.level],
                                label=DocItemLabel.TEXT,
                                text=seg_clean,
                                content_layer=self.content_layer,
                                formatting=annotated_text.formatting,
                                hyperlink=annotated_text.hyperlink,
                            )
                            added_refs.append(docling_text2.get_ref())

    for node in element.contents:
        if isinstance(node, Tag):
            name = node.name.lower()
            if name == "img":
                _flush_buffer()
                im_ref3 = self._emit_image(node, doc)
                if im_ref3:
                    added_refs.append(im_ref3)
            elif name in _FORMAT_TAG_MAP:
                # Formatting tag: walk its content with the format applied.
                _flush_buffer()
                with self._use_format([name]):
                    wk = self._walk(node, doc)
                    added_refs.extend(wk)
            elif name == "a":
                # Anchor: walk its content with the hyperlink applied; the
                # buffer is not flushed here.
                with self._use_hyperlink(node):
                    wk2 = self._walk(node, doc)
                    added_refs.extend(wk2)
            elif name in _BLOCK_TAGS:
                _flush_buffer()
                blk = self._handle_block(node, doc)
                added_refs.extend(blk)
            elif node.find(_BLOCK_TAGS):
                # Not a block tag itself, but it contains one: descend.
                _flush_buffer()
                wk3 = self._walk(node, doc)
                added_refs.extend(wk3)
            else:
                # Purely inline content: keep buffering.
                buffer.extend(
                    self._extract_text_and_hyperlink_recursively(
                        node, find_parent_annotation=True, keep_newlines=True
                    )
                )
        elif isinstance(node, NavigableString) and not isinstance(
            node, PreformattedString
        ):
            # A string made only of newline characters (or an empty one)
            # acts as a boundary and flushes the buffer.
            if str(node).strip("\n\r") == "":
                _flush_buffer()
            else:
                buffer.extend(
                    self._extract_text_and_hyperlink_recursively(
                        node, find_parent_annotation=True, keep_newlines=True
                    )
                )

    _flush_buffer()
    return added_refs
730
+
731
@staticmethod
def _collect_parent_format_tags(item: PageElement) -> list[str]:
    """List the format tags found among the ancestors of ``item``.

    Each tag name of ``_FORMAT_TAG_MAP`` is reported at most once, in the
    iteration order of that mapping.
    """
    ancestor_names = set()
    ancestor = item.parent
    while ancestor is not None:
        ancestor_names.add(ancestor.name)
        ancestor = ancestor.parent
    return [tag_name for tag_name in _FORMAT_TAG_MAP if tag_name in ancestor_names]
742
+
743
@property
def _formatting(self):
    """Build a ``Formatting`` from the active format tags, or None if none apply.

    The keyword arguments of all tags in ``self.format_tags`` are merged in
    order, later tags overriding earlier ones on the same key.
    """
    merged = {
        key: value
        for tag_name in self.format_tags
        for key, value in _FORMAT_TAG_MAP[tag_name].items()
    }
    return Formatting(**merged) if merged else None
751
+
752
def _extract_text_and_hyperlink_recursively(
    self,
    item: PageElement,
    ignore_list=False,
    find_parent_annotation=False,
    keep_newlines=False,
) -> AnnotatedTextList:
    """Collect the text below ``item`` together with its inline annotations.

    Args:
        item: The tag or string to extract text from.
        ignore_list: If True, the children of a <ul> or <ol> tag are skipped.
        find_parent_annotation: If True, the ancestors of ``item`` are
            searched for an <a href> first, so that its hyperlink is applied.
        keep_newlines: If True, a string consisting only of newline characters
            (or an empty one) is reported as a "\n" fragment instead of being
            dropped.

    Returns:
        The extracted fragments, each carrying the hyperlink, formatting and
        code flag active when it was visited.
    """
    result: AnnotatedTextList = AnnotatedTextList()

    # If find_parent_annotation, make sure that we keep track of
    # any a- or formatting-tag that has been present in the
    # DOM-parents already.
    if find_parent_annotation:
        format_tags = self._collect_parent_format_tags(item)
        this_parent = item.parent
        while this_parent is not None:
            if this_parent.name == "a" and this_parent.get("href"):
                with self._use_format(format_tags):
                    with self._use_hyperlink(this_parent):
                        # NOTE(review): keep_newlines is not forwarded here, so
                        # it falls back to False for this call - confirm.
                        return self._extract_text_and_hyperlink_recursively(
                            item, ignore_list
                        )
            this_parent = this_parent.parent
        # NOTE(review): when no <a href> ancestor exists, the collected
        # format_tags are not applied - confirm this is intended.

    # Preformatted strings yield no text.
    if isinstance(item, PreformattedString):
        return AnnotatedTextList()

    if isinstance(item, NavigableString):
        text = item.strip()
        # The fragment is code if any active format tag is a code tag.
        code = any(code_tag in self.format_tags for code_tag in _CODE_TAG_SET)
        if text:
            return AnnotatedTextList(
                [
                    AnnotatedText(
                        text=text,
                        hyperlink=self.hyperlink,
                        formatting=self._formatting,
                        code=code,
                    )
                ]
            )
        if keep_newlines and item.strip("\n\r") == "":
            return AnnotatedTextList(
                [
                    AnnotatedText(
                        text="\n",
                        hyperlink=self.hyperlink,
                        formatting=self._formatting,
                        code=code,
                    )
                ]
            )
        return AnnotatedTextList()

    tag = cast(Tag, item)
    if not ignore_list or (tag.name not in ["ul", "ol"]):
        for child in tag:
            if isinstance(child, Tag) and child.name in _FORMAT_TAG_MAP:
                # Formatting tag: its format is active while descending.
                with self._use_format([child.name]):
                    result.extend(
                        self._extract_text_and_hyperlink_recursively(
                            child, ignore_list, keep_newlines=keep_newlines
                        )
                    )
            elif isinstance(child, Tag) and child.name == "a":
                # Anchor: its hyperlink is active while descending.
                with self._use_hyperlink(child):
                    result.extend(
                        self._extract_text_and_hyperlink_recursively(
                            child, ignore_list, keep_newlines=keep_newlines
                        )
                    )
            else:
                # Recursively get the child's text content
                result.extend(
                    self._extract_text_and_hyperlink_recursively(
                        child, ignore_list, keep_newlines=keep_newlines
                    )
                )
    return result
831
+
832
+ @contextmanager
833
+ def _use_hyperlink(self, tag: Tag):
834
+ old_hyperlink: Union[AnyUrl, Path, None] = None
835
+ new_hyperlink: Union[AnyUrl, Path, None] = None
836
+ this_href = tag.get("href")
837
+ if this_href is None:
838
+ yield None
839
+ else:
840
+ if isinstance(this_href, str) and this_href:
841
+ old_hyperlink = self.hyperlink
842
+ this_href = self._resolve_relative_path(this_href)
843
+ # ugly fix for relative links since pydantic does not support them.
844
+ try:
845
+ new_hyperlink = AnyUrl(this_href)
846
+ except ValidationError:
847
+ new_hyperlink = Path(this_href)
848
+ self.hyperlink = new_hyperlink
849
+ try:
850
+ yield None
851
+ finally:
852
+ if new_hyperlink:
853
+ self.hyperlink = old_hyperlink
854
+
855
+ @contextmanager
856
+ def _use_format(self, tags: list[str]):
857
+ if not tags:
858
+ yield None
859
+ else:
860
+ self.format_tags.extend(tags)
861
+ try:
862
+ yield None
863
+ finally:
864
+ self.format_tags = self.format_tags[: -len(tags)]
865
+
866
+ @contextmanager
867
+ def _use_inline_group(
868
+ self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
869
+ ):
870
+ """Create an inline group for annotated texts.
871
+
872
+ Checks if annotated_text_list has more than one item and if so creates an inline
873
+ group in which the text elements can then be generated. While the context manager
874
+ is active the inline group is set as the current parent.
875
+
876
+ Args:
877
+ annotated_text_list (AnnotatedTextList): Annotated text
878
+ doc (DoclingDocument): Currently used document
879
+ """
880
+ if len(annotated_text_list) > 1:
881
+ inline_fmt = doc.add_group(
882
+ label=GroupLabel.INLINE,
883
+ parent=self.parents[self.level],
884
+ content_layer=self.content_layer,
885
+ )
886
+ self.parents[self.level + 1] = inline_fmt
887
+ self.level += 1
888
+ try:
889
+ yield None
890
+ finally:
891
+ self.parents[self.level] = None
892
+ self.level -= 1
893
+ else:
894
+ yield None
895
+
896
@contextmanager
def _use_details(self, tag: Tag, doc: DoclingDocument):
    """Create a section group for the content of a details tag.

    A group named after the tag is added under the current parent. While the
    context manager is active, the hierarchy level is one higher than before
    and that group is the current parent.

    Args:
        tag: The details tag.
        doc: Currently used document.
    """
    details_group = doc.add_group(
        name=tag.name,
        label=GroupLabel.SECTION,
        parent=self.parents[self.level],
        content_layer=self.content_layer,
    )
    self.parents[self.level + 1] = details_group
    self.level += 1
    try:
        yield None
    finally:
        # Clear the level below the group, then step back up.
        self.parents[self.level + 1] = None
        self.level -= 1
919
+
920
@contextmanager
def _use_footer(self, tag: Tag, doc: DoclingDocument):
    """Create a furniture section group for the content of a footer tag.

    While the context manager is active, the content layer is set to
    ``ContentLayer.FURNITURE``, the hierarchy level is one higher than before
    and the new group is the current parent. The previous content layer is
    restored on exit.

    Args:
        tag: The footer tag.
        doc: Currently used document.
    """
    previous_layer = self.content_layer
    self.content_layer = ContentLayer.FURNITURE
    footer_group = doc.add_group(
        name=tag.name,
        label=GroupLabel.SECTION,
        parent=self.parents[self.level],
        content_layer=self.content_layer,
    )
    self.parents[self.level + 1] = footer_group
    self.level += 1
    try:
        yield None
    finally:
        # Clear the level below the group, step back up, restore the layer.
        self.parents[self.level + 1] = None
        self.level -= 1
        self.content_layer = previous_layer
946
+
947
+ @contextmanager
948
+ def _use_table_cell_context(self):
949
+ """Preserve the hierarchy level and parents during table cell processing.
950
+
951
+ While the context manager is active, the hierarchy level and parents can be modified.
952
+ When exiting, the original level and parents are restored.
953
+ """
954
+ original_level = self.level
955
+ original_parents = self.parents.copy()
956
+ try:
957
+ yield
958
+ finally:
959
+ self.level = original_level
960
+ self.parents = original_parents
961
+
962
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
    """Add a heading tag (``h1``..``h6``) to the document.

    An ``h1`` is added as the document title and resets the parent
    hierarchy. Any other heading is added as a section heading at its level
    lowered by one; missing intermediate levels are filled with section
    groups and deeper stale levels are cleared. Images inside the heading are
    emitted afterwards.

    Args:
        tag: The heading tag.
        doc: Currently used document.

    Returns:
        The reference of the added title/heading followed by the references
        of any emitted images.
    """
    refs = []
    name = tag.name.lower()
    # set default content layer to BODY as soon as we encounter a heading
    self.content_layer = ContentLayer.BODY
    heading_level = int(name[1])
    heading = self._extract_text_and_hyperlink_recursively(
        tag, find_parent_annotation=True
    ).to_single_text_element()
    clean_text = HTMLDocumentBackend._clean_unicode(heading.text)

    if heading_level == 1:
        # the first level is for the title item
        for depth in self.parents:
            self.parents[depth] = None
        self.level = 0
        title_item = doc.add_title(
            clean_text,
            content_layer=self.content_layer,
            formatting=heading.formatting,
            hyperlink=heading.hyperlink,
        )
        self.parents[self.level + 1] = title_item
        if title_item is not None:
            refs = [title_item.get_ref()]
    else:
        # the other levels need to be lowered by 1 if a title was set
        target = heading_level - 1
        if target > self.level:
            # add invisible group
            for depth in range(self.level, target):
                _log.debug(f"Adding invisible group to level {depth}")
                self.parents[depth + 1] = doc.add_group(
                    name=f"header-{depth + 1}",
                    label=GroupLabel.SECTION,
                    parent=self.parents[depth],
                    content_layer=self.content_layer,
                )
            self.level = target
        elif target < self.level:
            # remove the tail
            for depth in self.parents:
                if depth > target + 1:
                    _log.debug(f"Remove the tail of level {depth}")
                    self.parents[depth] = None
            self.level = target

        heading_item = doc.add_heading(
            parent=self.parents[self.level],
            text=clean_text,
            orig=heading.text,
            level=self.level,
            content_layer=self.content_layer,
            formatting=heading.formatting,
            hyperlink=heading.hyperlink,
        )
        self.parents[self.level + 1] = heading_item
        if heading_item is not None:
            refs = [heading_item.get_ref()]

    # Content following the heading is attached one level below it.
    self.level += 1
    for image in tag("img"):
        if isinstance(image, Tag):
            image_ref = self._emit_image(image, doc)
            if image_ref:
                refs.append(image_ref)
    return refs
1027
+
1028
def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem:
    """Add a ``<ul>``/``<ol>`` tag and its items to the document.

    A list group is created under the current parent. Every direct ``<li>``
    child becomes a list item; when its text consists of several annotated
    parts, an empty list item is created and the parts are added inside an
    inline group below it. Nested lists and images found in an item are
    processed as well. Lists that appear as direct children of the list
    (invalid HTML) are handled like nested blocks.

    Args:
        tag: The list tag.
        doc: Currently used document.

    Returns:
        The reference of the created list group.
    """
    tag_name = tag.name.lower()
    start: Optional[int] = None
    name: str = ""
    is_ordered = tag_name == "ol"
    if is_ordered:
        start_attr = tag.get("start")
        # isdecimal() only accepts characters that int() can parse, unlike
        # isnumeric() which also accepts e.g. superscripts and fractions.
        if isinstance(start_attr, str) and start_attr.isdecimal():
            start = int(start_attr)
        name = "ordered list" + (f" start {start}" if start is not None else "")
    else:
        name = "list"
    # Create the list container
    list_group = doc.add_list_group(
        name=name,
        parent=self.parents[self.level],
        content_layer=self.content_layer,
    )
    self.parents[self.level + 1] = list_group
    self.ctx.list_ordered_flag_by_ref[list_group.self_ref] = is_ordered
    if is_ordered and start is not None:
        self.ctx.list_start_by_ref[list_group.self_ref] = start
    self.level += 1

    # For each top-level <li> in this list
    for li in tag.find_all({"li", "ul", "ol"}, recursive=False):
        if not isinstance(li, Tag):
            continue

        # sub-list items should be indented under main list items, but temporarily
        # addressing invalid HTML (docling-core/issues/357)
        if li.name in {"ul", "ol"}:
            self._handle_block(li, doc)

        else:
            # 1) determine the marker
            if is_ordered and start is not None:
                marker = f"{start + len(list_group.children)}."
            else:
                marker = ""

            # 2) extract only the "direct" text from this <li>
            parts = self._extract_text_and_hyperlink_recursively(
                li, ignore_list=True, find_parent_annotation=True
            )
            min_parts = parts.simplify_text_elements()
            li_text = re.sub(
                r"\s+|\n+", " ", "".join([el.text for el in min_parts])
            ).strip()

            # 3) add the list item
            if li_text:
                if len(min_parts) > 1:
                    # create an empty list element in order to hook the inline group onto that one
                    self.parents[self.level + 1] = doc.add_list_item(
                        text="",
                        enumerated=is_ordered,
                        marker=marker,
                        parent=list_group,
                        content_layer=self.content_layer,
                    )
                    self.level += 1
                    with self._use_inline_group(min_parts, doc):
                        for annotated_text in min_parts:
                            li_text = re.sub(
                                r"\s+|\n+", " ", annotated_text.text
                            ).strip()
                            li_clean = HTMLDocumentBackend._clean_unicode(li_text)
                            if annotated_text.code:
                                doc.add_code(
                                    parent=self.parents[self.level],
                                    text=li_clean,
                                    content_layer=self.content_layer,
                                    formatting=annotated_text.formatting,
                                    hyperlink=annotated_text.hyperlink,
                                )
                            else:
                                doc.add_text(
                                    parent=self.parents[self.level],
                                    label=DocItemLabel.TEXT,
                                    text=li_clean,
                                    content_layer=self.content_layer,
                                    formatting=annotated_text.formatting,
                                    hyperlink=annotated_text.hyperlink,
                                )

                    # 4) recurse into any nested lists, attaching them to this <li> item
                    for sublist in li({"ul", "ol"}, recursive=False):
                        if isinstance(sublist, Tag):
                            self._handle_block(sublist, doc)

                    # now the list element with inline group is not a parent anymore
                    self.parents[self.level] = None
                    self.level -= 1
                else:
                    annotated_text = min_parts[0]
                    li_text = re.sub(r"\s+|\n+", " ", annotated_text.text).strip()
                    li_clean = HTMLDocumentBackend._clean_unicode(li_text)
                    self.parents[self.level + 1] = doc.add_list_item(
                        text=li_clean,
                        enumerated=is_ordered,
                        marker=marker,
                        orig=li_text,
                        parent=list_group,
                        content_layer=self.content_layer,
                        formatting=annotated_text.formatting,
                        hyperlink=annotated_text.hyperlink,
                    )

                    # 4) recurse into any nested lists, attaching them to this <li> item
                    for sublist in li({"ul", "ol"}, recursive=False):
                        if isinstance(sublist, Tag):
                            self.level += 1
                            self._handle_block(sublist, doc)
                            self.parents[self.level + 1] = None
                            self.level -= 1
            else:
                # No direct text: nested lists are still processed.
                for sublist in li({"ul", "ol"}, recursive=False):
                    if isinstance(sublist, Tag):
                        self._handle_block(sublist, doc)

            # 5) extract any images under this <li>
            for img_tag in li("img"):
                if isinstance(img_tag, Tag):
                    self._emit_image(img_tag, doc)

    # Leave the list: clear the level below the group and step back up.
    self.parents[self.level + 1] = None
    self.level -= 1
    return list_group.get_ref()
1157
+
1158
@staticmethod
def get_html_table_row_col(tag: Tag) -> tuple[int, int]:
    """Count the rows and columns of an HTML table, honouring spans.

    Direct ``<thead>``/``<tbody>`` children are unwrapped first, which
    modifies ``tag`` in place so that their rows become direct children.
    The column count is the largest sum of column spans over all rows. A row
    is counted unless it contains no ``<td>`` and every ``<th>`` in it has a
    row span different from 1.

    Args:
        tag: The table tag.

    Returns:
        A ``(num_rows, num_cols)`` tuple.
    """
    for t in cast(list[Tag], tag.find_all(["thead", "tbody"], recursive=False)):
        t.unwrap()
    # Find the number of rows and columns (taking into account spans)
    num_rows: int = 0
    num_cols: int = 0
    for row in tag("tr", recursive=False):
        col_count = 0
        is_row_header = True
        if not isinstance(row, Tag):
            continue
        for cell in row(["td", "th"], recursive=False):
            # Guard the cell itself (the original re-checked the row here).
            if not isinstance(cell, Tag):
                continue
            col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell)
            col_count += col_span
            if cell.name == "td" or row_span == 1:
                is_row_header = False
        num_cols = max(num_cols, col_count)
        if not is_row_header:
            num_rows += 1
    return num_rows, num_cols
1182
+
1183
def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
    """Convert a block-level tag into document items.

    Dispatches on the lower-cased tag name: ``figure``, headings
    (``h1``..``h6``), lists (``ul``/``ol``), text blocks
    (``p``/``address``/``summary``), ``table``, ``pre``, ``footer`` and
    ``details``. Other tags add nothing.

    Args:
        tag: The block tag to convert.
        doc: Currently used document.

    Returns:
        References of the items added directly by this call. The ``footer``
        and ``details`` branches walk their content via ``self._walk`` and
        contribute no references here.
    """
    added_refs = []
    tag_name = tag.name.lower()

    if tag_name == "figure":
        img_tag = tag.find("img")
        if isinstance(img_tag, Tag):
            im_ref = self._emit_image(img_tag, doc)
            if im_ref is not None:
                added_refs.append(im_ref)

    elif tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
        heading_refs = self._handle_heading(tag, doc)
        added_refs.extend(heading_refs)

    elif tag_name in {"ul", "ol"}:
        list_ref = self._handle_list(tag, doc)
        added_refs.append(list_ref)

    elif tag_name in {"p", "address", "summary"}:
        text_list = self._extract_text_and_hyperlink_recursively(
            tag, find_parent_annotation=True
        )
        annotated_texts: AnnotatedTextList = text_list.simplify_text_elements()
        for part in annotated_texts.split_by_newline():
            with self._use_inline_group(part, doc):
                for annotated_text in part:
                    if seg := annotated_text.text.strip():
                        seg_clean = HTMLDocumentBackend._clean_unicode(seg)
                        if annotated_text.code:
                            docling_code = doc.add_code(
                                parent=self.parents[self.level],
                                text=seg_clean,
                                content_layer=self.content_layer,
                                formatting=annotated_text.formatting,
                                hyperlink=annotated_text.hyperlink,
                            )
                            added_refs.append(docling_code.get_ref())
                        else:
                            docling_text = doc.add_text(
                                parent=self.parents[self.level],
                                label=DocItemLabel.TEXT,
                                text=seg_clean,
                                content_layer=self.content_layer,
                                formatting=annotated_text.formatting,
                                hyperlink=annotated_text.hyperlink,
                            )
                            added_refs.append(docling_text.get_ref())

        # Images of a text block are emitted but not reported in added_refs.
        for img_tag in tag("img"):
            if isinstance(img_tag, Tag):
                self._emit_image(img_tag, doc)

    elif tag_name == "table":
        num_rows, num_cols = self.get_html_table_row_col(tag)
        data_e = TableData(num_rows=num_rows, num_cols=num_cols)
        docling_table = doc.add_table(
            data=data_e,
            parent=self.parents[self.level],
            content_layer=self.content_layer,
        )
        added_refs.append(docling_table.get_ref())
        self.parse_table_data(tag, doc, docling_table, num_rows, num_cols)

        for img_tag in tag("img"):
            if isinstance(img_tag, Tag):
                # Emit the image itself; passing the table tag here would
                # lose the image's src/alt attributes.
                im_ref2 = self._emit_image(img_tag, doc)
                if im_ref2 is not None:
                    added_refs.append(im_ref2)

    elif tag_name in {"pre"}:
        # handle monospace code snippets (pre).
        text_list = self._extract_text_and_hyperlink_recursively(
            tag, find_parent_annotation=True, keep_newlines=True
        )
        annotated_texts = text_list.simplify_text_elements()
        with self._use_inline_group(annotated_texts, doc):
            for annotated_text in annotated_texts:
                text_clean = HTMLDocumentBackend._clean_unicode(
                    annotated_text.text.strip()
                )
                docling_code2 = doc.add_code(
                    parent=self.parents[self.level],
                    text=text_clean,
                    content_layer=self.content_layer,
                    formatting=annotated_text.formatting,
                    hyperlink=annotated_text.hyperlink,
                )
                added_refs.append(docling_code2.get_ref())

    elif tag_name == "footer":
        with self._use_footer(tag, doc):
            self._walk(tag, doc)

    elif tag_name == "details":
        with self._use_details(tag, doc):
            self._walk(tag, doc)
    return added_refs
1281
+
1282
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> Optional[RefItem]:
    """Add a picture item for an image tag, with an optional caption.

    The caption is taken, in order of precedence, from a ``<figcaption>``
    that is a direct child of the enclosing ``<figure>``, from the ``alt``
    text combined with the ``href`` of an enclosing link, or from the plain
    ``alt`` text. When image fetching is disabled in the backend options, or
    the tag has no ``src``, a picture without image data is added instead.

    Args:
        img_tag: The image tag.
        doc: Currently used document.

    Returns:
        The reference of the added picture item.
    """
    parent = self.parents[self.level]
    figure = img_tag.find_parent("figure")
    caption: AnnotatedTextList = AnnotatedTextList()

    # check if the figure has a link - this is HACK:
    def _enclosing_href(node):
        ancestor = node.parent
        while ancestor is not None:
            if ancestor.name == "a" and ancestor.get("href"):
                return ancestor.get("href")
            ancestor = ancestor.parent
        return None

    if link_target := _enclosing_href(img_tag):
        alt_text = img_tag.get("alt") or ""
        caption.append(AnnotatedText(text=alt_text, hyperlink=link_target))

    if isinstance(figure, Tag):
        figcaption = figure.find("figcaption", recursive=False)
        if isinstance(figcaption, Tag):
            caption = self._extract_text_and_hyperlink_recursively(
                figcaption, find_parent_annotation=True
            )
    if not caption and img_tag.get("alt"):
        caption = AnnotatedTextList([AnnotatedText(text=img_tag.get("alt"))])

    merged_caption = caption.to_single_text_element()

    caption_item: Optional[TextItem] = None
    if merged_caption.text:
        caption_item = doc.add_text(
            label=DocItemLabel.CAPTION,
            text=HTMLDocumentBackend._clean_unicode(merged_caption.text.strip()),
            orig=merged_caption.text,
            content_layer=self.content_layer,
            formatting=merged_caption.formatting,
            hyperlink=merged_caption.hyperlink,
        )

    src_loc: str = self._get_attr_as_string(img_tag, "src")
    fetch_enabled = cast(HTMLBackendOptions, self.options).fetch_images
    if not fetch_enabled or not src_loc:
        # Do not fetch the image, just add a placeholder
        placeholder: PictureItem = doc.add_picture(
            caption=caption_item,
            parent=parent,
            content_layer=self.content_layer,
        )
        return placeholder.get_ref()

    image_ref = self._create_image_ref(self._resolve_relative_path(src_loc))
    picture = doc.add_picture(
        image=image_ref,
        caption=caption_item,
        parent=parent,
        content_layer=self.content_layer,
    )
    return picture.get_ref()
1346
+
1347
def _create_image_ref(self, src_url: str) -> Optional[ImageRef]:
    """Load an image and wrap it in an ``ImageRef``.

    Loading is best effort: failures to fetch, decode or validate the image
    are reported with ``warnings.warn`` and result in ``None``.

    Args:
        src_url: Location of the image as accepted by ``_load_image_data``.

    Returns:
        The image reference, or ``None`` when no image data was loaded or
        the image could not be processed.
    """
    try:
        img_data = self._load_image_data(src_url)
        if img_data:
            img = Image.open(BytesIO(img_data))
            # Use the first entry of the image's "dpi" info, 72 if absent.
            return ImageRef.from_pil(img, dpi=int(img.info.get("dpi", (72,))[0]))
    except (
        # RequestException is the base class of HTTPError and also covers
        # connection errors and timeouts, which should not abort conversion.
        requests.RequestException,
        ValidationError,
        UnidentifiedImageError,
        OperationNotAllowed,
        TypeError,
        ValueError,
    ) as e:
        warnings.warn(f"Could not process an image from {src_url}: {e}")

    return None
1364
+
1365
def _load_image_data(self, src_loc: str) -> Optional[bytes]:
    """Return the raw bytes of an image referenced by ``src_loc``.

    Supported locations are remote URLs (as decided by ``_is_remote_url``),
    ``data:`` URIs with base64 image payload, ``file://`` URIs and local
    paths. Locations ending in ``.svg`` are skipped.

    Args:
        src_loc: The image location.

    Returns:
        The image bytes, or ``None`` for skipped SVG files.

    Raises:
        OperationNotAllowed: If the required remote or local fetching is not
            enabled in the backend options.
        ValueError: If a local file does not exist or is not readable.
        requests.HTTPError: If a remote request returns an error status.
    """
    if src_loc.lower().endswith(".svg"):
        _log.debug(f"Skipping SVG file: {src_loc}")
        return None

    if HTMLDocumentBackend._is_remote_url(src_loc):
        if not self.options.enable_remote_fetch:
            raise OperationNotAllowed(
                "Fetching remote resources is only allowed when set explicitly. "
                "Set options.enable_remote_fetch=True."
            )
        # NOTE(review): the request has no timeout and may block indefinitely.
        response = requests.get(src_loc, stream=True)
        try:
            response.raise_for_status()
            return response.content
        finally:
            # A streamed response keeps its connection until it is consumed
            # or closed; close it explicitly, also on an error status.
            response.close()
    elif src_loc.startswith("data:"):
        data = re.sub(r"^data:image/.+;base64,", "", src_loc)
        return base64.b64decode(data)

    if src_loc.startswith("file://"):
        src_loc = src_loc[7:]

    if not self.options.enable_local_fetch:
        raise OperationNotAllowed(
            "Fetching local resources is only allowed when set explicitly. "
            "Set options.enable_local_fetch=True."
        )
    # add check that file exists and can read
    if os.path.isfile(src_loc) and os.access(src_loc, os.R_OK):
        with open(src_loc, "rb") as f:
            return f.read()
    else:
        raise ValueError("File does not exist or it is not readable.")
1397
+
1398
@staticmethod
def get_text(item: PageElement) -> str:
    """Concatenate all strings found below a PageElement.

    Works like `PageElement.get_text()` with one addition: the text of every
    <p> and <li> tag is followed by a single space. All other text is joined
    without separators.
    """

    def _fragments(node: PageElement):
        """Yield the text pieces of ``node`` in document order."""
        if isinstance(node, NavigableString):
            yield node
        elif isinstance(node, Tag):
            for child in node:
                yield from _fragments(child)
            if node.name in {"p", "li"}:
                yield " "

    return "".join(_fragments(item))
1427
+
1428
+ @staticmethod
1429
+ def _clean_unicode(text: str) -> str:
1430
+ """Replace typical Unicode characters in HTML for text processing.
1431
+
1432
+ Several Unicode characters (e.g., non-printable or formatting) are typically
1433
+ found in HTML but are worth replacing to sanitize text and ensure consistency
1434
+ in text processing tasks.
1435
+
1436
+ Args:
1437
+ text: The original text.
1438
+
1439
+ Returns:
1440
+ The sanitized text without typical Unicode characters.
1441
+ """
1442
+ replacements = {
1443
+ "\u00a0": " ", # non-breaking space
1444
+ "\u200b": "", # zero-width space
1445
+ "\u200c": "", # zero-width non-joiner
1446
+ "\u200d": "", # zero-width joiner
1447
+ "\u2010": "-", # hyphen
1448
+ "\u2011": "-", # non-breaking hyphen
1449
+ "\u2012": "-", # dash
1450
+ "\u2013": "-", # dash
1451
+ "\u2014": "-", # dash
1452
+ "\u2015": "-", # horizontal bar
1453
+ "\u2018": "'", # left single quotation mark
1454
+ "\u2019": "'", # right single quotation mark
1455
+ "\u201c": '"', # left double quotation mark
1456
+ "\u201d": '"', # right double quotation mark
1457
+ "\u2026": "...", # ellipsis
1458
+ "\u00ad": "", # soft hyphen
1459
+ "\ufeff": "", # zero width non-break space
1460
+ "\u202f": " ", # narrow non-break space
1461
+ "\u2060": "", # word joiner
1462
+ }
1463
+ for raw, clean in replacements.items():
1464
+ text = text.replace(raw, clean)
1465
+
1466
+ return text
1467
+
1468
+ @staticmethod
1469
+ def _get_cell_spans(cell: Tag) -> tuple[int, int]:
1470
+ """Extract colspan and rowspan values from a table cell tag.
1471
+
1472
+ This function retrieves the 'colspan' and 'rowspan' attributes from a given
1473
+ table cell tag.
1474
+ If the attribute does not exist or it is not numeric, it defaults to 1.
1475
+ """
1476
+ raw_spans: tuple[str, str] = (
1477
+ str(cell.get("colspan", "1")),
1478
+ str(cell.get("rowspan", "1")),
1479
+ )
1480
+
1481
+ def _extract_num(s: str) -> int:
1482
+ if s and s[0].isnumeric():
1483
+ match = re.search(r"\d+", s)
1484
+ if match:
1485
+ return int(match.group())
1486
+ return 1
1487
+
1488
+ int_spans: tuple[int, int] = (
1489
+ _extract_num(raw_spans[0]),
1490
+ _extract_num(raw_spans[1]),
1491
+ )
1492
+
1493
+ return int_spans
1494
+
1495
+ @staticmethod
1496
+ def _get_attr_as_string(tag: Tag, attr: str, default: str = "") -> str:
1497
+ """Get attribute value as string, handling list values."""
1498
+ value = tag.get(attr)
1499
+ if not value:
1500
+ return default
1501
+
1502
+ return value[0] if isinstance(value, list) else value