docling 2.29.0__tar.gz → 2.31.0__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (87):
  1. {docling-2.29.0 → docling-2.31.0}/PKG-INFO +4 -3
  2. {docling-2.29.0 → docling-2.31.0}/README.md +1 -0
  3. {docling-2.29.0 → docling-2.31.0}/docling/backend/asciidoc_backend.py +7 -15
  4. {docling-2.29.0 → docling-2.31.0}/docling/backend/csv_backend.py +1 -1
  5. {docling-2.29.0 → docling-2.31.0}/docling/backend/docling_parse_backend.py +2 -2
  6. {docling-2.29.0 → docling-2.31.0}/docling/backend/docling_parse_v2_backend.py +2 -2
  7. {docling-2.29.0 → docling-2.31.0}/docling/backend/docling_parse_v4_backend.py +3 -4
  8. {docling-2.29.0 → docling-2.31.0}/docling/backend/docx/latex/latex_dict.py +0 -5
  9. {docling-2.29.0 → docling-2.31.0}/docling/backend/docx/latex/omml.py +4 -7
  10. {docling-2.29.0 → docling-2.31.0}/docling/backend/html_backend.py +26 -9
  11. {docling-2.29.0 → docling-2.31.0}/docling/backend/md_backend.py +5 -7
  12. docling-2.31.0/docling/backend/msexcel_backend.py +519 -0
  13. {docling-2.29.0 → docling-2.31.0}/docling/backend/mspowerpoint_backend.py +4 -7
  14. {docling-2.29.0 → docling-2.31.0}/docling/backend/msword_backend.py +23 -15
  15. {docling-2.29.0 → docling-2.31.0}/docling/backend/pdf_backend.py +2 -1
  16. {docling-2.29.0 → docling-2.31.0}/docling/backend/pypdfium2_backend.py +3 -3
  17. {docling-2.29.0 → docling-2.31.0}/docling/backend/xml/jats_backend.py +10 -13
  18. {docling-2.29.0 → docling-2.31.0}/docling/backend/xml/uspto_backend.py +15 -19
  19. {docling-2.29.0 → docling-2.31.0}/docling/cli/main.py +27 -9
  20. {docling-2.29.0 → docling-2.31.0}/docling/cli/models.py +2 -3
  21. {docling-2.29.0 → docling-2.31.0}/docling/datamodel/base_models.py +40 -5
  22. {docling-2.29.0 → docling-2.31.0}/docling/datamodel/document.py +18 -10
  23. {docling-2.29.0 → docling-2.31.0}/docling/datamodel/pipeline_options.py +29 -4
  24. {docling-2.29.0 → docling-2.31.0}/docling/document_converter.py +5 -5
  25. docling-2.31.0/docling/models/api_vlm_model.py +66 -0
  26. {docling-2.29.0 → docling-2.31.0}/docling/models/base_model.py +2 -4
  27. {docling-2.29.0 → docling-2.31.0}/docling/models/base_ocr_model.py +2 -2
  28. {docling-2.29.0 → docling-2.31.0}/docling/models/code_formula_model.py +2 -1
  29. {docling-2.29.0 → docling-2.31.0}/docling/models/document_picture_classifier.py +2 -1
  30. {docling-2.29.0 → docling-2.31.0}/docling/models/easyocr_model.py +10 -11
  31. {docling-2.29.0 → docling-2.31.0}/docling/models/factories/__init__.py +2 -2
  32. {docling-2.29.0 → docling-2.31.0}/docling/models/factories/base_factory.py +1 -1
  33. {docling-2.29.0 → docling-2.31.0}/docling/models/hf_mlx_model.py +4 -6
  34. {docling-2.29.0 → docling-2.31.0}/docling/models/hf_vlm_model.py +7 -5
  35. {docling-2.29.0 → docling-2.31.0}/docling/models/layout_model.py +2 -2
  36. {docling-2.29.0 → docling-2.31.0}/docling/models/ocr_mac_model.py +3 -4
  37. {docling-2.29.0 → docling-2.31.0}/docling/models/page_assemble_model.py +7 -12
  38. {docling-2.29.0 → docling-2.31.0}/docling/models/page_preprocessing_model.py +2 -1
  39. docling-2.31.0/docling/models/picture_description_api_model.py +59 -0
  40. {docling-2.29.0 → docling-2.31.0}/docling/models/picture_description_base_model.py +16 -5
  41. {docling-2.29.0 → docling-2.31.0}/docling/models/picture_description_vlm_model.py +2 -3
  42. {docling-2.29.0 → docling-2.31.0}/docling/models/rapid_ocr_model.py +2 -3
  43. {docling-2.29.0 → docling-2.31.0}/docling/models/readingorder_model.py +8 -23
  44. {docling-2.29.0 → docling-2.31.0}/docling/models/table_structure_model.py +2 -6
  45. {docling-2.29.0 → docling-2.31.0}/docling/models/tesseract_ocr_cli_model.py +17 -16
  46. {docling-2.29.0 → docling-2.31.0}/docling/models/tesseract_ocr_model.py +8 -6
  47. {docling-2.29.0 → docling-2.31.0}/docling/pipeline/base_pipeline.py +4 -8
  48. {docling-2.29.0 → docling-2.31.0}/docling/pipeline/simple_pipeline.py +0 -1
  49. {docling-2.29.0 → docling-2.31.0}/docling/pipeline/standard_pdf_pipeline.py +6 -3
  50. {docling-2.29.0 → docling-2.31.0}/docling/pipeline/vlm_pipeline.py +27 -20
  51. docling-2.31.0/docling/utils/api_image_request.py +61 -0
  52. {docling-2.29.0 → docling-2.31.0}/docling/utils/export.py +2 -4
  53. {docling-2.29.0 → docling-2.31.0}/docling/utils/glm_utils.py +2 -2
  54. {docling-2.29.0 → docling-2.31.0}/docling/utils/layout_postprocessor.py +4 -2
  55. {docling-2.29.0 → docling-2.31.0}/docling/utils/model_downloader.py +7 -7
  56. {docling-2.29.0 → docling-2.31.0}/docling/utils/utils.py +1 -1
  57. {docling-2.29.0 → docling-2.31.0}/pyproject.toml +80 -14
  58. docling-2.29.0/docling/backend/msexcel_backend.py +0 -343
  59. docling-2.29.0/docling/models/picture_description_api_model.py +0 -125
  60. {docling-2.29.0 → docling-2.31.0}/LICENSE +0 -0
  61. {docling-2.29.0 → docling-2.31.0}/docling/__init__.py +0 -0
  62. {docling-2.29.0 → docling-2.31.0}/docling/backend/__init__.py +0 -0
  63. {docling-2.29.0 → docling-2.31.0}/docling/backend/abstract_backend.py +0 -0
  64. {docling-2.29.0 → docling-2.31.0}/docling/backend/docx/__init__.py +0 -0
  65. {docling-2.29.0 → docling-2.31.0}/docling/backend/docx/latex/__init__.py +0 -0
  66. {docling-2.29.0 → docling-2.31.0}/docling/backend/json/__init__.py +0 -0
  67. {docling-2.29.0 → docling-2.31.0}/docling/backend/json/docling_json_backend.py +0 -0
  68. {docling-2.29.0 → docling-2.31.0}/docling/backend/xml/__init__.py +0 -0
  69. {docling-2.29.0 → docling-2.31.0}/docling/chunking/__init__.py +0 -0
  70. {docling-2.29.0 → docling-2.31.0}/docling/cli/__init__.py +0 -0
  71. {docling-2.29.0 → docling-2.31.0}/docling/cli/tools.py +0 -0
  72. {docling-2.29.0 → docling-2.31.0}/docling/datamodel/__init__.py +0 -0
  73. {docling-2.29.0 → docling-2.31.0}/docling/datamodel/settings.py +0 -0
  74. {docling-2.29.0 → docling-2.31.0}/docling/exceptions.py +0 -0
  75. {docling-2.29.0 → docling-2.31.0}/docling/models/__init__.py +0 -0
  76. {docling-2.29.0 → docling-2.31.0}/docling/models/factories/ocr_factory.py +0 -0
  77. {docling-2.29.0 → docling-2.31.0}/docling/models/factories/picture_description_factory.py +0 -0
  78. {docling-2.29.0 → docling-2.31.0}/docling/models/plugins/__init__.py +0 -0
  79. {docling-2.29.0 → docling-2.31.0}/docling/models/plugins/defaults.py +0 -0
  80. {docling-2.29.0 → docling-2.31.0}/docling/pipeline/__init__.py +0 -0
  81. {docling-2.29.0 → docling-2.31.0}/docling/py.typed +0 -0
  82. {docling-2.29.0 → docling-2.31.0}/docling/utils/__init__.py +0 -0
  83. {docling-2.29.0 → docling-2.31.0}/docling/utils/accelerator_utils.py +0 -0
  84. {docling-2.29.0 → docling-2.31.0}/docling/utils/locks.py +0 -0
  85. {docling-2.29.0 → docling-2.31.0}/docling/utils/ocr_utils.py +0 -0
  86. {docling-2.29.0 → docling-2.31.0}/docling/utils/profiling.py +0 -0
  87. {docling-2.29.0 → docling-2.31.0}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.29.0
3
+ Version: 2.31.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/docling-project/docling
6
6
  License: MIT
@@ -28,7 +28,7 @@ Provides-Extra: vlm
28
28
  Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
29
29
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
30
30
  Requires-Dist: certifi (>=2024.7.4)
31
- Requires-Dist: docling-core[chunking] (>=2.24.1,<3.0.0)
31
+ Requires-Dist: docling-core[chunking] (>=2.26.0,<3.0.0)
32
32
  Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
33
33
  Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
34
34
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -58,7 +58,7 @@ Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
58
58
  Requires-Dist: tqdm (>=4.65.0,<5.0.0)
59
59
  Requires-Dist: transformers (>=4.42.0,<4.43.0) ; (sys_platform == "darwin" and platform_machine == "x86_64") and (extra == "vlm")
60
60
  Requires-Dist: transformers (>=4.46.0,<5.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
61
- Requires-Dist: typer (>=0.12.5,<0.13.0)
61
+ Requires-Dist: typer (>=0.12.5,<0.16.0)
62
62
  Project-URL: Repository, https://github.com/docling-project/docling
63
63
  Description-Content-Type: text/markdown
64
64
 
@@ -86,6 +86,7 @@ Description-Content-Type: text/markdown
86
86
  [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
87
87
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
88
88
  [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
89
+ [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
89
90
  [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
90
91
 
91
92
  Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
@@ -22,6 +22,7 @@
22
22
  [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
23
23
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
24
24
  [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
25
+ [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
25
26
  [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
26
27
 
27
28
  Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
@@ -34,7 +34,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
34
34
  text_stream = self.path_or_stream.getvalue().decode("utf-8")
35
35
  self.lines = text_stream.split("\n")
36
36
  if isinstance(self.path_or_stream, Path):
37
- with open(self.path_or_stream, "r", encoding="utf-8") as f:
37
+ with open(self.path_or_stream, encoding="utf-8") as f:
38
38
  self.lines = f.readlines()
39
39
  self.valid = True
40
40
 
@@ -75,14 +75,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
75
75
 
76
76
  return doc
77
77
 
78
- def _parse(self, doc: DoclingDocument):
78
+ def _parse(self, doc: DoclingDocument): # noqa: C901
79
79
  """
80
80
  Main function that orchestrates the parsing by yielding components:
81
81
  title, section headers, text, lists, and tables.
82
82
  """
83
83
 
84
- content = ""
85
-
86
84
  in_list = False
87
85
  in_table = False
88
86
 
@@ -95,7 +93,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
95
93
  # indents: dict[int, Union[DocItem, GroupItem, None]] = {}
96
94
  indents: dict[int, Union[GroupItem, None]] = {}
97
95
 
98
- for i in range(0, 10):
96
+ for i in range(10):
99
97
  parents[i] = None
100
98
  indents[i] = None
101
99
 
@@ -125,7 +123,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
125
123
 
126
124
  # Lists
127
125
  elif self._is_list_item(line):
128
-
129
126
  _log.debug(f"line: {line}")
130
127
  item = self._parse_list_item(line)
131
128
  _log.debug(f"parsed list-item: {item}")
@@ -147,7 +144,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
147
144
  indents[level + 1] = item["indent"]
148
145
 
149
146
  elif in_list and item["indent"] < indents[level]:
150
-
151
147
  # print(item["indent"], " => ", indents[level])
152
148
  while item["indent"] < indents[level]:
153
149
  # print(item["indent"], " => ", indents[level])
@@ -176,7 +172,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
176
172
  elif in_table and (
177
173
  (not self._is_table_line(line)) or line.strip() == "|==="
178
174
  ): # end of table
179
-
180
175
  caption = None
181
176
  if len(caption_data) > 0:
182
177
  caption = doc.add_text(
@@ -195,7 +190,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
195
190
 
196
191
  # Picture
197
192
  elif self._is_picture(line):
198
-
199
193
  caption = None
200
194
  if len(caption_data) > 0:
201
195
  caption = doc.add_text(
@@ -250,7 +244,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
250
244
  text_data = []
251
245
 
252
246
  elif len(line.strip()) > 0: # allow multiline texts
253
-
254
247
  item = self._parse_text(line)
255
248
  text_data.append(item["text"])
256
249
 
@@ -273,14 +266,14 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
273
266
 
274
267
  def _get_current_level(self, parents):
275
268
  for k, v in parents.items():
276
- if v == None and k > 0:
269
+ if v is None and k > 0:
277
270
  return k - 1
278
271
 
279
272
  return 0
280
273
 
281
274
  def _get_current_parent(self, parents):
282
275
  for k, v in parents.items():
283
- if v == None and k > 0:
276
+ if v is None and k > 0:
284
277
  return parents[k - 1]
285
278
 
286
279
  return None
@@ -328,7 +321,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
328
321
  "marker": marker,
329
322
  "text": text.strip(),
330
323
  "numbered": False,
331
- "indent": 0 if indent == None else len(indent),
324
+ "indent": 0 if indent is None else len(indent),
332
325
  }
333
326
  else:
334
327
  return {
@@ -336,7 +329,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
336
329
  "marker": marker,
337
330
  "text": text.strip(),
338
331
  "numbered": True,
339
- "indent": 0 if indent == None else len(indent),
332
+ "indent": 0 if indent is None else len(indent),
340
333
  }
341
334
  else:
342
335
  # Fallback if no match
@@ -357,7 +350,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
357
350
  return [cell.strip() for cell in line.split("|") if cell.strip()]
358
351
 
359
352
  def _populate_table_as_grid(self, table_data):
360
-
361
353
  num_rows = len(table_data)
362
354
 
363
355
  # Adjust the table data into a grid format
@@ -58,7 +58,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
58
58
  head = self.content.readline()
59
59
  dialect = csv.Sniffer().sniff(head, ",;\t|:")
60
60
  _log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
61
- if not dialect.delimiter in {",", ";", "\t", "|", ":"}:
61
+ if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
62
62
  raise RuntimeError(
63
63
  f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
64
64
  )
@@ -1,8 +1,9 @@
1
1
  import logging
2
2
  import random
3
+ from collections.abc import Iterable
3
4
  from io import BytesIO
4
5
  from pathlib import Path
5
- from typing import Iterable, List, Optional, Union
6
+ from typing import List, Optional, Union
6
7
 
7
8
  import pypdfium2 as pdfium
8
9
  from docling_core.types.doc import BoundingBox, CoordOrigin, Size
@@ -156,7 +157,6 @@ class DoclingParsePageBackend(PdfPageBackend):
156
157
  def get_page_image(
157
158
  self, scale: float = 1, cropbox: Optional[BoundingBox] = None
158
159
  ) -> Image.Image:
159
-
160
160
  page_size = self.get_size()
161
161
 
162
162
  if not cropbox:
@@ -1,8 +1,9 @@
1
1
  import logging
2
2
  import random
3
+ from collections.abc import Iterable
3
4
  from io import BytesIO
4
5
  from pathlib import Path
5
- from typing import TYPE_CHECKING, Iterable, List, Optional, Union
6
+ from typing import TYPE_CHECKING, List, Optional, Union
6
7
 
7
8
  import pypdfium2 as pdfium
8
9
  from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -172,7 +173,6 @@ class DoclingParseV2PageBackend(PdfPageBackend):
172
173
  def get_page_image(
173
174
  self, scale: float = 1, cropbox: Optional[BoundingBox] = None
174
175
  ) -> Image.Image:
175
-
176
176
  page_size = self.get_size()
177
177
 
178
178
  if not cropbox:
@@ -1,14 +1,14 @@
1
1
  import logging
2
- import random
2
+ from collections.abc import Iterable
3
3
  from io import BytesIO
4
4
  from pathlib import Path
5
- from typing import TYPE_CHECKING, Iterable, List, Optional, Union
5
+ from typing import TYPE_CHECKING, Optional, Union
6
6
 
7
7
  import pypdfium2 as pdfium
8
8
  from docling_core.types.doc import BoundingBox, CoordOrigin
9
9
  from docling_core.types.doc.page import SegmentedPdfPage, TextCell
10
10
  from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
11
- from PIL import Image, ImageDraw
11
+ from PIL import Image
12
12
  from pypdfium2 import PdfPage
13
13
 
14
14
  from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
@@ -93,7 +93,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
93
93
  def get_page_image(
94
94
  self, scale: float = 1, cropbox: Optional[BoundingBox] = None
95
95
  ) -> Image.Image:
96
-
97
96
  page_size = self.get_size()
98
97
 
99
98
  if not cropbox:
@@ -1,12 +1,8 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """
4
2
  Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
5
3
  On 23/01/2025
6
4
  """
7
5
 
8
- from __future__ import unicode_literals
9
-
10
6
  CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")
11
7
 
12
8
  BLANK = ""
@@ -79,7 +75,6 @@ CHR_BO = {
79
75
  }
80
76
 
81
77
  T = {
82
- "\u2192": "\\rightarrow ",
83
78
  # Greek letters
84
79
  "\U0001d6fc": "\\alpha ",
85
80
  "\U0001d6fd": "\\beta ",
@@ -76,8 +76,7 @@ def get_val(key, default=None, store=CHR):
76
76
  return default
77
77
 
78
78
 
79
- class Tag2Method(object):
80
-
79
+ class Tag2Method:
81
80
  def call_method(self, elm, stag=None):
82
81
  getmethod = self.tag2meth.get
83
82
  if stag is None:
@@ -130,7 +129,6 @@ class Tag2Method(object):
130
129
 
131
130
 
132
131
  class Pr(Tag2Method):
133
-
134
132
  text = ""
135
133
 
136
134
  __val_tags = ("chr", "pos", "begChr", "endChr", "type")
@@ -159,7 +157,7 @@ class Pr(Tag2Method):
159
157
  def do_common(self, elm):
160
158
  stag = elm.tag.replace(OMML_NS, "")
161
159
  if stag in self.__val_tags:
162
- t = elm.get("{0}val".format(OMML_NS))
160
+ t = elm.get(f"{OMML_NS}val")
163
161
  self.__innerdict[stag] = t
164
162
  return None
165
163
 
@@ -248,7 +246,6 @@ class oMath2Latex(Tag2Method):
248
246
  """
249
247
  the Pre-Sub-Superscript object -- Not support yet
250
248
  """
251
- pass
252
249
 
253
250
  def do_sub(self, elm):
254
251
  text = self.process_children(elm)
@@ -331,7 +328,7 @@ class oMath2Latex(Tag2Method):
331
328
  t_dict = self.process_children_dict(elm, include=("e", "lim"))
332
329
  latex_s = LIM_FUNC.get(t_dict["e"])
333
330
  if not latex_s:
334
- raise NotSupport("Not support lim %s" % t_dict["e"])
331
+ raise RuntimeError("Not support lim {}".format(t_dict["e"]))
335
332
  else:
336
333
  return latex_s.format(lim=t_dict.get("lim"))
337
334
 
@@ -413,7 +410,7 @@ class oMath2Latex(Tag2Method):
413
410
  """
414
411
  _str = []
415
412
  _base_str = []
416
- found_text = elm.findtext("./{0}t".format(OMML_NS))
413
+ found_text = elm.findtext(f"./{OMML_NS}t")
417
414
  if found_text:
418
415
  for s in found_text:
419
416
  out_latex_str = self.process_unicode(s)
@@ -26,6 +26,8 @@ _log = logging.getLogger(__name__)
26
26
 
27
27
  # tags that generate NodeItem elements
28
28
  TAGS_FOR_NODE_ITEMS: Final = [
29
+ "address",
30
+ "details",
29
31
  "h1",
30
32
  "h2",
31
33
  "h3",
@@ -38,6 +40,7 @@ TAGS_FOR_NODE_ITEMS: Final = [
38
40
  "ul",
39
41
  "ol",
40
42
  "li",
43
+ "summary",
41
44
  "table",
42
45
  "figure",
43
46
  "img",
@@ -55,7 +58,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
55
58
  self.max_levels = 10
56
59
  self.level = 0
57
60
  self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
58
- for i in range(0, self.max_levels):
61
+ for i in range(self.max_levels):
59
62
  self.parents[i] = None
60
63
 
61
64
  try:
@@ -126,7 +129,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
126
129
  return doc
127
130
 
128
131
  def walk(self, tag: Tag, doc: DoclingDocument) -> None:
129
-
130
132
  # Iterate over elements in the body of the document
131
133
  text: str = ""
132
134
  for element in tag.children:
@@ -135,7 +137,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
135
137
  self.analyze_tag(cast(Tag, element), doc)
136
138
  except Exception as exc_child:
137
139
  _log.error(
138
- f"Error processing child from tag {tag.name}: {repr(exc_child)}"
140
+ f"Error processing child from tag {tag.name}: {exc_child!r}"
139
141
  )
140
142
  raise exc_child
141
143
  elif isinstance(element, NavigableString) and not isinstance(
@@ -147,7 +149,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
147
149
  item for item in element.next_siblings if isinstance(item, Tag)
148
150
  ]
149
151
  if element.next_sibling is None or any(
150
- [item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
152
+ item.name in TAGS_FOR_NODE_ITEMS for item in siblings
151
153
  ):
152
154
  text = text.strip()
153
155
  if text and tag.name in ["div"]:
@@ -164,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
164
166
  def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
165
167
  if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
166
168
  self.handle_header(tag, doc)
167
- elif tag.name in ["p"]:
169
+ elif tag.name in ["p", "address", "summary"]:
168
170
  self.handle_paragraph(tag, doc)
169
171
  elif tag.name in ["pre", "code"]:
170
172
  self.handle_code(tag, doc)
@@ -178,6 +180,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
178
180
  self.handle_figure(tag, doc)
179
181
  elif tag.name == "img":
180
182
  self.handle_image(tag, doc)
183
+ elif tag.name == "details":
184
+ self.handle_details(tag, doc)
181
185
  else:
182
186
  self.walk(tag, doc)
183
187
 
@@ -202,6 +206,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
202
206
 
203
207
  return ["".join(result) + " "]
204
208
 
209
+ def handle_details(self, element: Tag, doc: DoclingDocument) -> None:
210
+ """Handle details tag (details) and its content."""
211
+
212
+ self.parents[self.level + 1] = doc.add_group(
213
+ name="details",
214
+ label=GroupLabel.SECTION,
215
+ parent=self.parents[self.level],
216
+ content_layer=self.content_layer,
217
+ )
218
+
219
+ self.level += 1
220
+ self.walk(element, doc)
221
+ self.parents[self.level + 1] = None
222
+ self.level -= 1
223
+
205
224
  def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
206
225
  """Handles header tags (h1, h2, etc.)."""
207
226
  hlevel = int(element.name.replace("h", ""))
@@ -222,7 +241,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
222
241
  )
223
242
  else:
224
243
  if hlevel > self.level:
225
-
226
244
  # add invisible group
227
245
  for i in range(self.level + 1, hlevel):
228
246
  self.parents[i] = doc.add_group(
@@ -234,7 +252,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
234
252
  self.level = hlevel
235
253
 
236
254
  elif hlevel < self.level:
237
-
238
255
  # remove the tail
239
256
  for key in self.parents.keys():
240
257
  if key > hlevel:
@@ -261,7 +278,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
261
278
  )
262
279
 
263
280
  def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
264
- """Handles paragraph tags (p)."""
281
+ """Handles paragraph tags (p) or equivalent ones."""
265
282
  if element.text is None:
266
283
  return
267
284
  text = element.text.strip()
@@ -360,7 +377,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
360
377
  marker = ""
361
378
  enumerated = False
362
379
  if parent_label == GroupLabel.ORDERED_LIST:
363
- marker = f"{str(index_in_list)}."
380
+ marker = f"{index_in_list!s}."
364
381
  enumerated = True
365
382
  doc.add_list_item(
366
383
  text=text,
@@ -83,7 +83,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
83
83
  # otherwise they represent emphasis (bold or italic)
84
84
  self.markdown = self._shorten_underscore_sequences(text_stream)
85
85
  if isinstance(self.path_or_stream, Path):
86
- with open(self.path_or_stream, "r", encoding="utf-8") as f:
86
+ with open(self.path_or_stream, encoding="utf-8") as f:
87
87
  md_content = f.read()
88
88
  # remove invalid sequences
89
89
  # very long sequences of underscores will lead to unnecessary long processing times.
@@ -168,7 +168,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
168
168
  )
169
169
  self.inline_texts = []
170
170
 
171
- def _iterate_elements(
171
+ def _iterate_elements( # noqa: C901
172
172
  self,
173
173
  element: marko.element.Element,
174
174
  depth: int,
@@ -176,7 +176,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
176
176
  visited: Set[marko.element.Element],
177
177
  parent_item: Optional[NodeItem] = None,
178
178
  ):
179
-
180
179
  if element in visited:
181
180
  return
182
181
 
@@ -236,7 +235,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
236
235
  if has_non_empty_list_items:
237
236
  label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
238
237
  parent_item = doc.add_group(
239
- label=label, name=f"list", parent=parent_item
238
+ label=label, name="list", parent=parent_item
240
239
  )
241
240
 
242
241
  elif (
@@ -320,7 +319,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
320
319
  self._html_blocks += 1
321
320
  self._process_inline_text(parent_item, doc)
322
321
  self._close_table(doc)
323
- _log.debug("HTML Block: {}".format(element))
322
+ _log.debug(f"HTML Block: {element}")
324
323
  if (
325
324
  len(element.body) > 0
326
325
  ): # If Marko doesn't return any content for HTML block, skip it
@@ -332,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
332
331
  else:
333
332
  if not isinstance(element, str):
334
333
  self._close_table(doc)
335
- _log.debug("Some other element: {}".format(element))
334
+ _log.debug(f"Some other element: {element}")
336
335
 
337
336
  processed_block_types = (
338
337
  marko.block.Heading,
@@ -398,7 +397,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
398
397
 
399
398
  # if HTML blocks were detected, export to HTML and delegate to HTML backend
400
399
  if self._html_blocks > 0:
401
-
402
400
  # export to HTML
403
401
  html_backend_cls = HTMLDocumentBackend
404
402
  html_str = doc.export_to_html()