docling 2.42.0__py3-none-any.whl → 2.42.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,11 @@
1
1
  import logging
2
+ import re
2
3
  import traceback
3
4
  from io import BytesIO
4
5
  from pathlib import Path
5
6
  from typing import Final, Optional, Union, cast
6
7
 
7
- from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
8
+ from bs4 import BeautifulSoup, NavigableString, Tag
8
9
  from bs4.element import PreformattedString
9
10
  from docling_core.types.doc import (
10
11
  DocItem,
@@ -15,6 +16,7 @@ from docling_core.types.doc import (
15
16
  GroupLabel,
16
17
  TableCell,
17
18
  TableData,
19
+ TextItem,
18
20
  )
19
21
  from docling_core.types.doc.document import ContentLayer
20
22
  from pydantic import BaseModel
@@ -26,10 +28,14 @@ from docling.datamodel.document import InputDocument
26
28
 
27
29
  _log = logging.getLogger(__name__)
28
30
 
29
- # tags that generate NodeItem elements
30
- TAGS_FOR_NODE_ITEMS: Final = [
31
+ DEFAULT_IMAGE_WIDTH = 128
32
+ DEFAULT_IMAGE_HEIGHT = 128
33
+
34
+ # Tags that initiate distinct Docling items
35
+ _BLOCK_TAGS: Final = {
31
36
  "address",
32
37
  "details",
38
+ "figure",
33
39
  "h1",
34
40
  "h2",
35
41
  "h3",
@@ -41,12 +47,9 @@ TAGS_FOR_NODE_ITEMS: Final = [
41
47
  "code",
42
48
  "ul",
43
49
  "ol",
44
- "li",
45
50
  "summary",
46
51
  "table",
47
- "figure",
48
- "img",
49
- ]
52
+ }
50
53
 
51
54
 
52
55
  class _Context(BaseModel):
@@ -56,12 +59,16 @@ class _Context(BaseModel):
56
59
 
57
60
  class HTMLDocumentBackend(DeclarativeDocumentBackend):
58
61
  @override
59
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
62
+ def __init__(
63
+ self,
64
+ in_doc: InputDocument,
65
+ path_or_stream: Union[BytesIO, Path],
66
+ ):
60
67
  super().__init__(in_doc, path_or_stream)
61
68
  self.soup: Optional[Tag] = None
62
- # HTML file:
63
69
  self.path_or_stream = path_or_stream
64
- # Initialise the parents for the hierarchy
70
+
71
+ # Initialize the parents for the hierarchy
65
72
  self.max_levels = 10
66
73
  self.level = 0
67
74
  self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
@@ -70,13 +77,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
70
77
  self.parents[i] = None
71
78
 
72
79
  try:
73
- if isinstance(self.path_or_stream, BytesIO):
74
- text_stream = self.path_or_stream.getvalue()
75
- self.soup = BeautifulSoup(text_stream, "html.parser")
76
- if isinstance(self.path_or_stream, Path):
77
- with open(self.path_or_stream, "rb") as f:
78
- html_content = f.read()
79
- self.soup = BeautifulSoup(html_content, "html.parser")
80
+ raw = (
81
+ path_or_stream.getvalue()
82
+ if isinstance(path_or_stream, BytesIO)
83
+ else Path(path_or_stream).read_bytes()
84
+ )
85
+ self.soup = BeautifulSoup(raw, "html.parser")
80
86
  except Exception as e:
81
87
  raise RuntimeError(
82
88
  "Could not initialize HTML backend for file with "
@@ -96,7 +102,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
96
102
  def unload(self):
97
103
  if isinstance(self.path_or_stream, BytesIO):
98
104
  self.path_or_stream.close()
99
-
100
105
  self.path_or_stream = None
101
106
 
102
107
  @classmethod
@@ -106,211 +111,156 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
106
111
 
107
112
  @override
108
113
  def convert(self) -> DoclingDocument:
109
- # access self.path_or_stream to load stuff
114
+ _log.debug("Starting HTML conversion...")
115
+ if not self.is_valid():
116
+ raise RuntimeError("Invalid HTML document.")
117
+
110
118
  origin = DocumentOrigin(
111
119
  filename=self.file.name or "file",
112
120
  mimetype="text/html",
113
121
  binary_hash=self.document_hash,
114
122
  )
115
-
116
123
  doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
117
- _log.debug("Trying to convert HTML...")
118
-
119
- if self.is_valid():
120
- assert self.soup is not None
121
- content = self.soup.body or self.soup
122
- # Replace <br> tags with newline characters
123
- # TODO: remove style to avoid losing text from tags like i, b, span, ...
124
- for br in content("br"):
125
- br.replace_with(NavigableString("\n"))
126
-
127
- headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
128
- self.content_layer = (
129
- ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
130
- )
131
- self.ctx = _Context() # reset context
132
- self.walk(content, doc)
133
- else:
134
- raise RuntimeError(
135
- f"Cannot convert doc with {self.document_hash} because the backend "
136
- "failed to init."
137
- )
138
- return doc
139
-
140
- def walk(self, tag: Tag, doc: DoclingDocument) -> None:
141
- # Iterate over elements in the body of the document
142
- text: str = ""
143
- for element in tag.children:
144
- if isinstance(element, Tag):
145
- try:
146
- self.analyze_tag(cast(Tag, element), doc)
147
- except Exception as exc_child:
148
- _log.error(
149
- f"Error processing child from tag {tag.name}:\n{traceback.format_exc()}"
150
- )
151
- raise exc_child
152
- elif isinstance(element, NavigableString) and not isinstance(
153
- element, PreformattedString
154
- ):
155
- # Floating text outside paragraphs or analyzed tags
156
- text += element
157
- siblings: list[Tag] = [
158
- item for item in element.next_siblings if isinstance(item, Tag)
159
- ]
160
- if element.next_sibling is None or any(
161
- item.name in TAGS_FOR_NODE_ITEMS for item in siblings
162
- ):
163
- text = text.strip()
164
- if text and tag.name in ["div"]:
165
- doc.add_text(
166
- parent=self.parents[self.level],
167
- label=DocItemLabel.TEXT,
168
- text=text,
169
- content_layer=self.content_layer,
170
- )
171
- text = ""
172
-
173
- return
174
-
175
- def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
176
- if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
177
- self.handle_header(tag, doc)
178
- elif tag.name in ["p", "address", "summary"]:
179
- self.handle_paragraph(tag, doc)
180
- elif tag.name in ["pre", "code"]:
181
- self.handle_code(tag, doc)
182
- elif tag.name in ["ul", "ol"]:
183
- self.handle_list(tag, doc)
184
- elif tag.name in ["li"]:
185
- self.handle_list_item(tag, doc)
186
- elif tag.name == "table":
187
- self.handle_table(tag, doc)
188
- elif tag.name == "figure":
189
- self.handle_figure(tag, doc)
190
- elif tag.name == "img":
191
- self.handle_image(tag, doc)
192
- elif tag.name == "details":
193
- self.handle_details(tag, doc)
194
- else:
195
- self.walk(tag, doc)
196
124
 
197
- def get_text(self, item: PageElement) -> str:
198
- """Get the text content of a tag."""
199
- parts: list[str] = self.extract_text_recursively(item)
200
-
201
- return "".join(parts) + " "
202
-
203
- # Function to recursively extract text from all child nodes
204
- def extract_text_recursively(self, item: PageElement) -> list[str]:
205
- result: list[str] = []
125
+ assert self.soup is not None
126
+ # set the title as furniture, since it is part of the document metadata
127
+ title = self.soup.title
128
+ if title:
129
+ doc.add_title(
130
+ text=title.get_text(separator=" ", strip=True),
131
+ content_layer=ContentLayer.FURNITURE,
132
+ )
133
+ # remove scripts/styles
134
+ for tag in self.soup(["script", "style"]):
135
+ tag.decompose()
136
+ content = self.soup.body or self.soup
137
+ # normalize <br> tags
138
+ for br in content("br"):
139
+ br.replace_with(NavigableString("\n"))
140
+ # set default content layer
141
+ headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
142
+ self.content_layer = (
143
+ ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
144
+ )
145
+ # reset context
146
+ self.ctx = _Context()
206
147
 
207
- if isinstance(item, NavigableString):
208
- return [item]
148
+ try:
149
+ self._walk(content, doc)
150
+ except Exception:
151
+ print(traceback.format_exc())
209
152
 
210
- tag = cast(Tag, item)
211
- if tag.name not in ["ul", "ol"]:
212
- for child in tag:
213
- # Recursively get the child's text content
214
- result.extend(self.extract_text_recursively(child))
153
+ return doc
215
154
 
216
- return ["".join(result) + " "]
155
+ def _walk(self, element: Tag, doc: DoclingDocument) -> None:
156
+ """Parse an XML tag by recursively walking its content.
217
157
 
218
- def handle_details(self, element: Tag, doc: DoclingDocument) -> None:
219
- """Handle details tag (details) and its content."""
158
+ While walking, the method buffers inline text across tags like <b> or <span>,
159
+ emitting text nodes only at block boundaries.
220
160
 
221
- self.parents[self.level + 1] = doc.add_group(
222
- name="details",
223
- label=GroupLabel.SECTION,
224
- parent=self.parents[self.level],
225
- content_layer=self.content_layer,
226
- )
161
+ Args:
162
+ element: The XML tag to parse.
163
+ doc: The Docling document to be updated with the parsed content.
164
+ """
165
+ buffer: list[str] = []
166
+
167
+ def flush_buffer():
168
+ if not buffer:
169
+ return
170
+ text = "".join(buffer).strip()
171
+ buffer.clear()
172
+ if not text:
173
+ return
174
+ for part in text.split("\n"):
175
+ seg = part.strip()
176
+ if seg:
177
+ doc.add_text(
178
+ DocItemLabel.TEXT,
179
+ seg,
180
+ parent=self.parents[self.level],
181
+ content_layer=self.content_layer,
182
+ )
227
183
 
228
- self.level += 1
229
- self.walk(element, doc)
230
- self.parents[self.level + 1] = None
231
- self.level -= 1
184
+ for node in element.contents:
185
+ if isinstance(node, Tag):
186
+ name = node.name.lower()
187
+ if name == "img":
188
+ flush_buffer()
189
+ self._emit_image(node, doc)
190
+ elif name in _BLOCK_TAGS:
191
+ flush_buffer()
192
+ self._handle_block(node, doc)
193
+ elif node.find(_BLOCK_TAGS):
194
+ flush_buffer()
195
+ self._walk(node, doc)
196
+ else:
197
+ buffer.append(node.text)
198
+ elif isinstance(node, NavigableString) and not isinstance(
199
+ node, PreformattedString
200
+ ):
201
+ buffer.append(str(node))
232
202
 
233
- def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
234
- """Handles header tags (h1, h2, etc.)."""
235
- hlevel = int(element.name.replace("h", ""))
236
- text = element.text.strip()
203
+ flush_buffer()
237
204
 
205
+ def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
206
+ tag_name = tag.name.lower()
207
+ # set default content layer to BODY as soon as we encounter a heading
238
208
  self.content_layer = ContentLayer.BODY
239
-
240
- if hlevel == 1:
209
+ level = int(tag_name[1])
210
+ text = tag.get_text(strip=True, separator=" ")
211
+ # the first level is for the title item
212
+ if level == 1:
241
213
  for key in self.parents.keys():
242
214
  self.parents[key] = None
243
-
244
- self.level = 1
245
- self.parents[self.level] = doc.add_text(
246
- parent=self.parents[0],
247
- label=DocItemLabel.TITLE,
248
- text=text,
249
- content_layer=self.content_layer,
215
+ self.level = 0
216
+ self.parents[self.level + 1] = doc.add_title(
217
+ text, content_layer=self.content_layer
250
218
  )
219
+ # the other levels need to be lowered by 1 if a title was set
251
220
  else:
252
- if hlevel > self.level:
221
+ level -= 1
222
+ if level > self.level:
253
223
  # add invisible group
254
- for i in range(self.level + 1, hlevel):
255
- self.parents[i] = doc.add_group(
256
- name=f"header-{i}",
224
+ for i in range(self.level, level):
225
+ _log.debug(f"Adding invisible group to level {i}")
226
+ self.parents[i + 1] = doc.add_group(
227
+ name=f"header-{i + 1}",
257
228
  label=GroupLabel.SECTION,
258
- parent=self.parents[i - 1],
229
+ parent=self.parents[i],
259
230
  content_layer=self.content_layer,
260
231
  )
261
- self.level = hlevel
262
-
263
- elif hlevel < self.level:
232
+ self.level = level
233
+ elif level < self.level:
264
234
  # remove the tail
265
235
  for key in self.parents.keys():
266
- if key > hlevel:
236
+ if key > level + 1:
237
+ _log.debug(f"Remove the tail of level {key}")
267
238
  self.parents[key] = None
268
- self.level = hlevel
269
-
270
- self.parents[hlevel] = doc.add_heading(
271
- parent=self.parents[hlevel - 1],
272
- text=text,
273
- level=hlevel - 1,
274
- content_layer=self.content_layer,
275
- )
276
-
277
- def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
278
- """Handles monospace code snippets (pre)."""
279
- if element.text is None:
280
- return
281
- text = element.text.strip()
282
- if text:
283
- doc.add_code(
284
- parent=self.parents[self.level],
285
- text=text,
286
- content_layer=self.content_layer,
287
- )
288
-
289
- def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
290
- """Handles paragraph tags (p) or equivalent ones."""
291
- if element.text is None:
292
- return
293
- text = element.text.strip()
294
- if text:
295
- doc.add_text(
239
+ self.level = level
240
+ self.parents[self.level + 1] = doc.add_heading(
296
241
  parent=self.parents[self.level],
297
- label=DocItemLabel.TEXT,
298
242
  text=text,
243
+ level=self.level,
299
244
  content_layer=self.content_layer,
300
245
  )
246
+ self.level += 1
247
+ for img_tag in tag("img"):
248
+ if isinstance(img_tag, Tag):
249
+ self._emit_image(img_tag, doc)
301
250
 
302
- def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
303
- """Handles list tags (ul, ol) and their list items."""
304
-
251
+ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> None:
252
+ tag_name = tag.name.lower()
305
253
  start: Optional[int] = None
306
- if is_ordered := element.name == "ol":
307
- start_attr = element.get("start")
254
+ name: str = ""
255
+ is_ordered = tag_name == "ol"
256
+ if is_ordered:
257
+ start_attr = tag.get("start")
308
258
  if isinstance(start_attr, str) and start_attr.isnumeric():
309
259
  start = int(start_attr)
310
260
  name = "ordered list" + (f" start {start}" if start is not None else "")
311
261
  else:
312
262
  name = "list"
313
- # create a list group
263
+ # Create the list container
314
264
  list_group = doc.add_list_group(
315
265
  name=name,
316
266
  parent=self.parents[self.level],
@@ -320,64 +270,152 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
320
270
  self.ctx.list_ordered_flag_by_ref[list_group.self_ref] = is_ordered
321
271
  if is_ordered and start is not None:
322
272
  self.ctx.list_start_by_ref[list_group.self_ref] = start
323
-
324
273
  self.level += 1
325
274
 
326
- self.walk(element, doc)
275
+ # For each top-level <li> in this list
276
+ for li in tag.find_all({"li", "ul", "ol"}, recursive=False):
277
+ if not isinstance(li, Tag):
278
+ continue
279
+
280
+ # sub-list items should be indented under main list items, but temporarily
281
+ # addressing invalid HTML (docling-core/issues/357)
282
+ if li.name in {"ul", "ol"}:
283
+ self._handle_block(li, doc)
284
+
285
+ else:
286
+ # 1) determine the marker
287
+ if is_ordered and start is not None:
288
+ marker = f"{start + len(list_group.children)}."
289
+ else:
290
+ marker = ""
291
+
292
+ # 2) extract only the "direct" text from this <li>
293
+ parts: list[str] = []
294
+ for child in li.contents:
295
+ if isinstance(child, NavigableString) and not isinstance(
296
+ child, PreformattedString
297
+ ):
298
+ parts.append(child)
299
+ elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
300
+ text_part = child.get_text()
301
+ if text_part:
302
+ parts.append(text_part)
303
+ li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
304
+
305
+ # 3) add the list item
306
+ if li_text:
307
+ self.parents[self.level + 1] = doc.add_list_item(
308
+ text=li_text,
309
+ enumerated=is_ordered,
310
+ marker=marker,
311
+ parent=list_group,
312
+ content_layer=self.content_layer,
313
+ )
314
+
315
+ # 4) recurse into any nested lists, attaching them to this <li> item
316
+ for sublist in li({"ul", "ol"}, recursive=False):
317
+ if isinstance(sublist, Tag):
318
+ self.level += 1
319
+ self._handle_block(sublist, doc)
320
+ self.parents[self.level + 1] = None
321
+ self.level -= 1
322
+ else:
323
+ for sublist in li({"ul", "ol"}, recursive=False):
324
+ if isinstance(sublist, Tag):
325
+ self._handle_block(sublist, doc)
326
+
327
+ # 5) extract any images under this <li>
328
+ for img_tag in li("img"):
329
+ if isinstance(img_tag, Tag):
330
+ self._emit_image(img_tag, doc)
327
331
 
328
332
  self.parents[self.level + 1] = None
329
333
  self.level -= 1
330
334
 
331
- def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
332
- """Handles list item tags (li)."""
333
- nested_list = element.find(["ul", "ol"])
334
-
335
- parent = self.parents[self.level]
336
- if parent is None:
337
- _log.debug(f"list-item has no parent in DoclingDocument: {element}")
338
- return
339
- enumerated = self.ctx.list_ordered_flag_by_ref.get(parent.self_ref, False)
340
- if enumerated and (start := self.ctx.list_start_by_ref.get(parent.self_ref)):
341
- marker = f"{start + len(parent.children)}."
342
- else:
343
- marker = ""
344
-
345
- if nested_list:
346
- # Text in list item can be hidden within hierarchy, hence
347
- # we need to extract it recursively
348
- text: str = self.get_text(element)
349
- # Flatten text, remove break lines:
350
- text = text.replace("\n", "").replace("\r", "")
351
- text = " ".join(text.split()).strip()
352
-
353
- if len(text) > 0:
354
- # create a list-item
355
- self.parents[self.level + 1] = doc.add_list_item(
356
- text=text,
357
- enumerated=enumerated,
358
- marker=marker,
359
- parent=parent,
335
+ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> None:
336
+ tag_name = tag.name.lower()
337
+
338
+ if tag_name == "figure":
339
+ img_tag = tag.find("img")
340
+ if isinstance(img_tag, Tag):
341
+ self._emit_image(img_tag, doc)
342
+
343
+ elif tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
344
+ self._handle_heading(tag, doc)
345
+
346
+ elif tag_name in {"ul", "ol"}:
347
+ self._handle_list(tag, doc)
348
+
349
+ elif tag_name in {"p", "address", "summary"}:
350
+ for part in tag.text.split("\n"):
351
+ seg = part.strip()
352
+ if seg:
353
+ doc.add_text(
354
+ parent=self.parents[self.level],
355
+ label=DocItemLabel.TEXT,
356
+ text=seg,
357
+ content_layer=self.content_layer,
358
+ )
359
+ for img_tag in tag("img"):
360
+ if isinstance(img_tag, Tag):
361
+ self._emit_image(img_tag, doc)
362
+
363
+ elif tag_name == "table":
364
+ data = HTMLDocumentBackend.parse_table_data(tag)
365
+ for img_tag in tag("img"):
366
+ if isinstance(img_tag, Tag):
367
+ self._emit_image(tag, doc)
368
+ if data is not None:
369
+ doc.add_table(
370
+ data=data,
371
+ parent=self.parents[self.level],
360
372
  content_layer=self.content_layer,
361
373
  )
362
- self.level += 1
363
- self.walk(element, doc)
364
- self.parents[self.level + 1] = None
365
- self.level -= 1
366
- else:
367
- self.walk(element, doc)
368
374
 
369
- elif element.text.strip():
370
- text = element.text.strip()
375
+ elif tag_name in {"pre", "code"}:
376
+ # handle monospace code snippets (pre).
377
+ text = tag.get_text(strip=True)
378
+ if text:
379
+ doc.add_code(
380
+ parent=self.parents[self.level],
381
+ text=text,
382
+ content_layer=self.content_layer,
383
+ )
371
384
 
372
- doc.add_list_item(
373
- text=text,
374
- enumerated=enumerated,
375
- marker=marker,
376
- parent=parent,
385
+ elif tag_name == "details":
386
+ # handle details and its content.
387
+ self.parents[self.level + 1] = doc.add_group(
388
+ name="details",
389
+ label=GroupLabel.SECTION,
390
+ parent=self.parents[self.level],
377
391
  content_layer=self.content_layer,
378
392
  )
379
- else:
380
- _log.debug(f"list-item has no text: {element}")
393
+ self.level += 1
394
+ self._walk(tag, doc)
395
+ self.parents[self.level + 1] = None
396
+ self.level -= 1
397
+
398
+ def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
399
+ figure = img_tag.find_parent("figure")
400
+ caption: str = ""
401
+ if isinstance(figure, Tag):
402
+ caption_tag = figure.find("figcaption", recursive=False)
403
+ if isinstance(caption_tag, Tag):
404
+ caption = caption_tag.get_text()
405
+ if not caption:
406
+ caption = str(img_tag.get("alt", "")).strip()
407
+
408
+ caption_item: Optional[TextItem] = None
409
+ if caption:
410
+ caption_item = doc.add_text(
411
+ DocItemLabel.CAPTION, text=caption, content_layer=self.content_layer
412
+ )
413
+
414
+ doc.add_picture(
415
+ caption=caption_item,
416
+ parent=self.parents[self.level],
417
+ content_layer=self.content_layer,
418
+ )
381
419
 
382
420
  @staticmethod
383
421
  def _get_cell_spans(cell: Tag) -> tuple[int, int]:
@@ -502,84 +540,3 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
502
540
  data.table_cells.append(table_cell)
503
541
 
504
542
  return data
505
-
506
- def handle_table(self, element: Tag, doc: DoclingDocument) -> None:
507
- """Handles table tags."""
508
-
509
- table_data = HTMLDocumentBackend.parse_table_data(element)
510
-
511
- if table_data is not None:
512
- doc.add_table(
513
- data=table_data,
514
- parent=self.parents[self.level],
515
- content_layer=self.content_layer,
516
- )
517
-
518
- def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
519
- """Recursively extract text from <ul> or <ol> with proper indentation."""
520
- result = []
521
- bullet_char = "*" # Default bullet character for unordered lists
522
-
523
- if list_element.name == "ol": # For ordered lists, use numbers
524
- for i, li in enumerate(list_element("li", recursive=False), 1):
525
- if not isinstance(li, Tag):
526
- continue
527
- # Add numbering for ordered lists
528
- result.append(f"{' ' * level}{i}. {li.get_text(strip=True)}")
529
- # Handle nested lists
530
- nested_list = li.find(["ul", "ol"])
531
- if isinstance(nested_list, Tag):
532
- result.extend(self.get_list_text(nested_list, level + 1))
533
- elif list_element.name == "ul": # For unordered lists, use bullet points
534
- for li in list_element("li", recursive=False):
535
- if not isinstance(li, Tag):
536
- continue
537
- # Add bullet points for unordered lists
538
- result.append(
539
- f"{' ' * level}{bullet_char} {li.get_text(strip=True)}"
540
- )
541
- # Handle nested lists
542
- nested_list = li.find(["ul", "ol"])
543
- if isinstance(nested_list, Tag):
544
- result.extend(self.get_list_text(nested_list, level + 1))
545
-
546
- return result
547
-
548
- def handle_figure(self, element: Tag, doc: DoclingDocument) -> None:
549
- """Handles image tags (img)."""
550
-
551
- # Extract the image URI from the <img> tag
552
- # image_uri = root.xpath('//figure//img/@src')[0]
553
-
554
- contains_captions = element.find(["figcaption"])
555
- if not isinstance(contains_captions, Tag):
556
- doc.add_picture(
557
- parent=self.parents[self.level],
558
- caption=None,
559
- content_layer=self.content_layer,
560
- )
561
- else:
562
- texts = []
563
- for item in contains_captions:
564
- texts.append(item.text)
565
-
566
- fig_caption = doc.add_text(
567
- label=DocItemLabel.CAPTION,
568
- text=("".join(texts)).strip(),
569
- content_layer=self.content_layer,
570
- )
571
- doc.add_picture(
572
- parent=self.parents[self.level],
573
- caption=fig_caption,
574
- content_layer=self.content_layer,
575
- )
576
-
577
- def handle_image(self, element: Tag, doc: DoclingDocument) -> None:
578
- """Handles image tags (img)."""
579
- _log.debug(f"ignoring <img> tags at the moment: {element}")
580
-
581
- doc.add_picture(
582
- parent=self.parents[self.level],
583
- caption=None,
584
- content_layer=self.content_layer,
585
- )
@@ -267,9 +267,14 @@ class LayoutPostprocessor:
267
267
  # Initial cell assignment
268
268
  clusters = self._assign_cells_to_clusters(clusters)
269
269
 
270
- # Remove clusters with no cells (if keep_empty_clusters is False)
270
+ # Remove clusters with no cells (if keep_empty_clusters is False),
271
+ # but always keep clusters with label DocItemLabel.FORMULA
271
272
  if not self.options.keep_empty_clusters:
272
- clusters = [cluster for cluster in clusters if cluster.cells]
273
+ clusters = [
274
+ cluster
275
+ for cluster in clusters
276
+ if cluster.cells or cluster.label == DocItemLabel.FORMULA
277
+ ]
273
278
 
274
279
  # Handle orphaned cells
275
280
  unassigned = self._find_unassigned_cells(clusters)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.42.0
3
+ Version: 2.42.1
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -9,7 +9,7 @@ docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE
9
9
  docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
10
10
  docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
11
11
  docling/backend/docling_parse_v4_backend.py,sha256=qR_WRVq9JGtRioWCw6MnLWgbvXbC6Y1yds7Ol1-E6UQ,6550
12
- docling/backend/html_backend.py,sha256=xyCbCGR3vYNl-wSP2YJRgSCy9kIIMKKu28AUylPEUq8,20959
12
+ docling/backend/html_backend.py,sha256=gGkm3i7FpW2WCJ-_GPpOJNh1LUq1_-vRGyGURuPagck,19284
13
13
  docling/backend/md_backend.py,sha256=mfwGj8g2hGC-Q_HREtl_Web65uMVXD-Ie1nRqWTXzF0,21013
14
14
  docling/backend/msexcel_backend.py,sha256=cq8MQ2RSh6pqCiVrldjOerSww7dOPTWmCQoCBI57i6w,18579
15
15
  docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
@@ -83,7 +83,7 @@ docling/utils/accelerator_utils.py,sha256=DSajLxVx1JEVT0zt5de26llciLNlVfIDfSa2zY
83
83
  docling/utils/api_image_request.py,sha256=_CgdzmPqdsyXmyYUFGLZcXcoH586qC6A1p5vsNbj1Q0,1416
84
84
  docling/utils/export.py,sha256=VwVUnYDk3mhGmISDbVm306fwpGNnoojouStBD4UajXI,4673
85
85
  docling/utils/glm_utils.py,sha256=TKOWQqWAHsX_w4fvoAA7_2xCi_urhnp1DsmjY8_sk5w,12274
86
- docling/utils/layout_postprocessor.py,sha256=m92UKjL-cIrOmOBi5Nuiby9FQWFyudcHigJKzud69-Q,24486
86
+ docling/utils/layout_postprocessor.py,sha256=LFLbBE-o3kWu79d8ZcyHlZPIqzQfCabZCIPTJ51lZsY,24657
87
87
  docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
88
88
  docling/utils/model_downloader.py,sha256=3vijCsAIVwWqehGBDRxRq7mJ3yRb9-zBsG00iqjqegU,4076
89
89
  docling/utils/ocr_utils.py,sha256=nmresYyfin0raanpQc_GGeU3WoLsfExf6SEXNIQ7Djg,2325
@@ -91,9 +91,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
91
91
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
92
92
  docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
93
93
  docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
94
- docling-2.42.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
95
- docling-2.42.0.dist-info/METADATA,sha256=jOwKrV5DDscuvMqHevJKC7-VA_hPOpDNz2lfJA6RAVE,10310
96
- docling-2.42.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
97
- docling-2.42.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
98
- docling-2.42.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
99
- docling-2.42.0.dist-info/RECORD,,
94
+ docling-2.42.1.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
95
+ docling-2.42.1.dist-info/METADATA,sha256=d46NOPDEps6dVLLMh3tWBCEQv7b_bwQQ46ndyqVO-ag,10310
96
+ docling-2.42.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
97
+ docling-2.42.1.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
98
+ docling-2.42.1.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
99
+ docling-2.42.1.dist-info/RECORD,,