docling 2.41.0__py3-none-any.whl → 2.42.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -260,7 +260,15 @@ class oMath2Latex(Tag2Method):
260
260
  the fraction object
261
261
  """
262
262
  c_dict = self.process_children_dict(elm)
263
- pr = c_dict["fPr"]
263
+ pr = c_dict.get("fPr")
264
+ if pr is None:
265
+ # Handle missing fPr element gracefully
266
+ _log.debug("Missing fPr element in fraction, using default formatting")
267
+ latex_s = F_DEFAULT
268
+ return latex_s.format(
269
+ num=c_dict.get("num"),
270
+ den=c_dict.get("den"),
271
+ )
264
272
  latex_s = get_val(pr.type, default=F_DEFAULT, store=F)
265
273
  return pr.text + latex_s.format(num=c_dict.get("num"), den=c_dict.get("den"))
266
274
 
@@ -1,10 +1,11 @@
1
1
  import logging
2
+ import re
2
3
  import traceback
3
4
  from io import BytesIO
4
5
  from pathlib import Path
5
6
  from typing import Final, Optional, Union, cast
6
7
 
7
- from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
8
+ from bs4 import BeautifulSoup, NavigableString, Tag
8
9
  from bs4.element import PreformattedString
9
10
  from docling_core.types.doc import (
10
11
  DocItem,
@@ -15,6 +16,7 @@ from docling_core.types.doc import (
15
16
  GroupLabel,
16
17
  TableCell,
17
18
  TableData,
19
+ TextItem,
18
20
  )
19
21
  from docling_core.types.doc.document import ContentLayer
20
22
  from pydantic import BaseModel
@@ -26,10 +28,14 @@ from docling.datamodel.document import InputDocument
26
28
 
27
29
  _log = logging.getLogger(__name__)
28
30
 
29
- # tags that generate NodeItem elements
30
- TAGS_FOR_NODE_ITEMS: Final = [
31
+ DEFAULT_IMAGE_WIDTH = 128
32
+ DEFAULT_IMAGE_HEIGHT = 128
33
+
34
+ # Tags that initiate distinct Docling items
35
+ _BLOCK_TAGS: Final = {
31
36
  "address",
32
37
  "details",
38
+ "figure",
33
39
  "h1",
34
40
  "h2",
35
41
  "h3",
@@ -41,12 +47,9 @@ TAGS_FOR_NODE_ITEMS: Final = [
41
47
  "code",
42
48
  "ul",
43
49
  "ol",
44
- "li",
45
50
  "summary",
46
51
  "table",
47
- "figure",
48
- "img",
49
- ]
52
+ }
50
53
 
51
54
 
52
55
  class _Context(BaseModel):
@@ -56,12 +59,16 @@ class _Context(BaseModel):
56
59
 
57
60
  class HTMLDocumentBackend(DeclarativeDocumentBackend):
58
61
  @override
59
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
62
+ def __init__(
63
+ self,
64
+ in_doc: InputDocument,
65
+ path_or_stream: Union[BytesIO, Path],
66
+ ):
60
67
  super().__init__(in_doc, path_or_stream)
61
68
  self.soup: Optional[Tag] = None
62
- # HTML file:
63
69
  self.path_or_stream = path_or_stream
64
- # Initialise the parents for the hierarchy
70
+
71
+ # Initialize the parents for the hierarchy
65
72
  self.max_levels = 10
66
73
  self.level = 0
67
74
  self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
@@ -70,13 +77,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
70
77
  self.parents[i] = None
71
78
 
72
79
  try:
73
- if isinstance(self.path_or_stream, BytesIO):
74
- text_stream = self.path_or_stream.getvalue()
75
- self.soup = BeautifulSoup(text_stream, "html.parser")
76
- if isinstance(self.path_or_stream, Path):
77
- with open(self.path_or_stream, "rb") as f:
78
- html_content = f.read()
79
- self.soup = BeautifulSoup(html_content, "html.parser")
80
+ raw = (
81
+ path_or_stream.getvalue()
82
+ if isinstance(path_or_stream, BytesIO)
83
+ else Path(path_or_stream).read_bytes()
84
+ )
85
+ self.soup = BeautifulSoup(raw, "html.parser")
80
86
  except Exception as e:
81
87
  raise RuntimeError(
82
88
  "Could not initialize HTML backend for file with "
@@ -96,7 +102,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
96
102
  def unload(self):
97
103
  if isinstance(self.path_or_stream, BytesIO):
98
104
  self.path_or_stream.close()
99
-
100
105
  self.path_or_stream = None
101
106
 
102
107
  @classmethod
@@ -106,211 +111,156 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
106
111
 
107
112
  @override
108
113
  def convert(self) -> DoclingDocument:
109
- # access self.path_or_stream to load stuff
114
+ _log.debug("Starting HTML conversion...")
115
+ if not self.is_valid():
116
+ raise RuntimeError("Invalid HTML document.")
117
+
110
118
  origin = DocumentOrigin(
111
119
  filename=self.file.name or "file",
112
120
  mimetype="text/html",
113
121
  binary_hash=self.document_hash,
114
122
  )
115
-
116
123
  doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
117
- _log.debug("Trying to convert HTML...")
118
-
119
- if self.is_valid():
120
- assert self.soup is not None
121
- content = self.soup.body or self.soup
122
- # Replace <br> tags with newline characters
123
- # TODO: remove style to avoid losing text from tags like i, b, span, ...
124
- for br in content("br"):
125
- br.replace_with(NavigableString("\n"))
126
-
127
- headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
128
- self.content_layer = (
129
- ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
130
- )
131
- self.ctx = _Context() # reset context
132
- self.walk(content, doc)
133
- else:
134
- raise RuntimeError(
135
- f"Cannot convert doc with {self.document_hash} because the backend "
136
- "failed to init."
137
- )
138
- return doc
139
-
140
- def walk(self, tag: Tag, doc: DoclingDocument) -> None:
141
- # Iterate over elements in the body of the document
142
- text: str = ""
143
- for element in tag.children:
144
- if isinstance(element, Tag):
145
- try:
146
- self.analyze_tag(cast(Tag, element), doc)
147
- except Exception as exc_child:
148
- _log.error(
149
- f"Error processing child from tag {tag.name}:\n{traceback.format_exc()}"
150
- )
151
- raise exc_child
152
- elif isinstance(element, NavigableString) and not isinstance(
153
- element, PreformattedString
154
- ):
155
- # Floating text outside paragraphs or analyzed tags
156
- text += element
157
- siblings: list[Tag] = [
158
- item for item in element.next_siblings if isinstance(item, Tag)
159
- ]
160
- if element.next_sibling is None or any(
161
- item.name in TAGS_FOR_NODE_ITEMS for item in siblings
162
- ):
163
- text = text.strip()
164
- if text and tag.name in ["div"]:
165
- doc.add_text(
166
- parent=self.parents[self.level],
167
- label=DocItemLabel.TEXT,
168
- text=text,
169
- content_layer=self.content_layer,
170
- )
171
- text = ""
172
-
173
- return
174
-
175
- def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
176
- if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
177
- self.handle_header(tag, doc)
178
- elif tag.name in ["p", "address", "summary"]:
179
- self.handle_paragraph(tag, doc)
180
- elif tag.name in ["pre", "code"]:
181
- self.handle_code(tag, doc)
182
- elif tag.name in ["ul", "ol"]:
183
- self.handle_list(tag, doc)
184
- elif tag.name in ["li"]:
185
- self.handle_list_item(tag, doc)
186
- elif tag.name == "table":
187
- self.handle_table(tag, doc)
188
- elif tag.name == "figure":
189
- self.handle_figure(tag, doc)
190
- elif tag.name == "img":
191
- self.handle_image(tag, doc)
192
- elif tag.name == "details":
193
- self.handle_details(tag, doc)
194
- else:
195
- self.walk(tag, doc)
196
-
197
- def get_text(self, item: PageElement) -> str:
198
- """Get the text content of a tag."""
199
- parts: list[str] = self.extract_text_recursively(item)
200
124
 
201
- return "".join(parts) + " "
202
-
203
- # Function to recursively extract text from all child nodes
204
- def extract_text_recursively(self, item: PageElement) -> list[str]:
205
- result: list[str] = []
206
-
207
- if isinstance(item, NavigableString):
208
- return [item]
209
-
210
- tag = cast(Tag, item)
211
- if tag.name not in ["ul", "ol"]:
212
- for child in tag:
213
- # Recursively get the child's text content
214
- result.extend(self.extract_text_recursively(child))
125
+ assert self.soup is not None
126
+ # set the title as furniture, since it is part of the document metadata
127
+ title = self.soup.title
128
+ if title:
129
+ doc.add_title(
130
+ text=title.get_text(separator=" ", strip=True),
131
+ content_layer=ContentLayer.FURNITURE,
132
+ )
133
+ # remove scripts/styles
134
+ for tag in self.soup(["script", "style"]):
135
+ tag.decompose()
136
+ content = self.soup.body or self.soup
137
+ # normalize <br> tags
138
+ for br in content("br"):
139
+ br.replace_with(NavigableString("\n"))
140
+ # set default content layer
141
+ headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
142
+ self.content_layer = (
143
+ ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
144
+ )
145
+ # reset context
146
+ self.ctx = _Context()
215
147
 
216
- return ["".join(result) + " "]
148
+ try:
149
+ self._walk(content, doc)
150
+ except Exception:
151
+ print(traceback.format_exc())
217
152
 
218
- def handle_details(self, element: Tag, doc: DoclingDocument) -> None:
219
- """Handle details tag (details) and its content."""
153
+ return doc
220
154
 
221
- self.parents[self.level + 1] = doc.add_group(
222
- name="details",
223
- label=GroupLabel.SECTION,
224
- parent=self.parents[self.level],
225
- content_layer=self.content_layer,
226
- )
155
+ def _walk(self, element: Tag, doc: DoclingDocument) -> None:
156
+ """Parse an XML tag by recursively walking its content.
157
+
158
+ While walking, the method buffers inline text across tags like <b> or <span>,
159
+ emitting text nodes only at block boundaries.
160
+
161
+ Args:
162
+ element: The XML tag to parse.
163
+ doc: The Docling document to be updated with the parsed content.
164
+ """
165
+ buffer: list[str] = []
166
+
167
+ def flush_buffer():
168
+ if not buffer:
169
+ return
170
+ text = "".join(buffer).strip()
171
+ buffer.clear()
172
+ if not text:
173
+ return
174
+ for part in text.split("\n"):
175
+ seg = part.strip()
176
+ if seg:
177
+ doc.add_text(
178
+ DocItemLabel.TEXT,
179
+ seg,
180
+ parent=self.parents[self.level],
181
+ content_layer=self.content_layer,
182
+ )
227
183
 
228
- self.level += 1
229
- self.walk(element, doc)
230
- self.parents[self.level + 1] = None
231
- self.level -= 1
184
+ for node in element.contents:
185
+ if isinstance(node, Tag):
186
+ name = node.name.lower()
187
+ if name == "img":
188
+ flush_buffer()
189
+ self._emit_image(node, doc)
190
+ elif name in _BLOCK_TAGS:
191
+ flush_buffer()
192
+ self._handle_block(node, doc)
193
+ elif node.find(_BLOCK_TAGS):
194
+ flush_buffer()
195
+ self._walk(node, doc)
196
+ else:
197
+ buffer.append(node.text)
198
+ elif isinstance(node, NavigableString) and not isinstance(
199
+ node, PreformattedString
200
+ ):
201
+ buffer.append(str(node))
232
202
 
233
- def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
234
- """Handles header tags (h1, h2, etc.)."""
235
- hlevel = int(element.name.replace("h", ""))
236
- text = element.text.strip()
203
+ flush_buffer()
237
204
 
205
+ def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
206
+ tag_name = tag.name.lower()
207
+ # set default content layer to BODY as soon as we encounter a heading
238
208
  self.content_layer = ContentLayer.BODY
239
-
240
- if hlevel == 1:
209
+ level = int(tag_name[1])
210
+ text = tag.get_text(strip=True, separator=" ")
211
+ # the first level is for the title item
212
+ if level == 1:
241
213
  for key in self.parents.keys():
242
214
  self.parents[key] = None
243
-
244
- self.level = 1
245
- self.parents[self.level] = doc.add_text(
246
- parent=self.parents[0],
247
- label=DocItemLabel.TITLE,
248
- text=text,
249
- content_layer=self.content_layer,
215
+ self.level = 0
216
+ self.parents[self.level + 1] = doc.add_title(
217
+ text, content_layer=self.content_layer
250
218
  )
219
+ # the other levels need to be lowered by 1 if a title was set
251
220
  else:
252
- if hlevel > self.level:
221
+ level -= 1
222
+ if level > self.level:
253
223
  # add invisible group
254
- for i in range(self.level + 1, hlevel):
255
- self.parents[i] = doc.add_group(
256
- name=f"header-{i}",
224
+ for i in range(self.level, level):
225
+ _log.debug(f"Adding invisible group to level {i}")
226
+ self.parents[i + 1] = doc.add_group(
227
+ name=f"header-{i + 1}",
257
228
  label=GroupLabel.SECTION,
258
- parent=self.parents[i - 1],
229
+ parent=self.parents[i],
259
230
  content_layer=self.content_layer,
260
231
  )
261
- self.level = hlevel
262
-
263
- elif hlevel < self.level:
232
+ self.level = level
233
+ elif level < self.level:
264
234
  # remove the tail
265
235
  for key in self.parents.keys():
266
- if key > hlevel:
236
+ if key > level + 1:
237
+ _log.debug(f"Remove the tail of level {key}")
267
238
  self.parents[key] = None
268
- self.level = hlevel
269
-
270
- self.parents[hlevel] = doc.add_heading(
271
- parent=self.parents[hlevel - 1],
272
- text=text,
273
- level=hlevel - 1,
274
- content_layer=self.content_layer,
275
- )
276
-
277
- def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
278
- """Handles monospace code snippets (pre)."""
279
- if element.text is None:
280
- return
281
- text = element.text.strip()
282
- if text:
283
- doc.add_code(
284
- parent=self.parents[self.level],
285
- text=text,
286
- content_layer=self.content_layer,
287
- )
288
-
289
- def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
290
- """Handles paragraph tags (p) or equivalent ones."""
291
- if element.text is None:
292
- return
293
- text = element.text.strip()
294
- if text:
295
- doc.add_text(
239
+ self.level = level
240
+ self.parents[self.level + 1] = doc.add_heading(
296
241
  parent=self.parents[self.level],
297
- label=DocItemLabel.TEXT,
298
242
  text=text,
243
+ level=self.level,
299
244
  content_layer=self.content_layer,
300
245
  )
246
+ self.level += 1
247
+ for img_tag in tag("img"):
248
+ if isinstance(img_tag, Tag):
249
+ self._emit_image(img_tag, doc)
301
250
 
302
- def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
303
- """Handles list tags (ul, ol) and their list items."""
304
-
251
+ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> None:
252
+ tag_name = tag.name.lower()
305
253
  start: Optional[int] = None
306
- if is_ordered := element.name == "ol":
307
- start_attr = element.get("start")
254
+ name: str = ""
255
+ is_ordered = tag_name == "ol"
256
+ if is_ordered:
257
+ start_attr = tag.get("start")
308
258
  if isinstance(start_attr, str) and start_attr.isnumeric():
309
259
  start = int(start_attr)
310
260
  name = "ordered list" + (f" start {start}" if start is not None else "")
311
261
  else:
312
262
  name = "list"
313
- # create a list group
263
+ # Create the list container
314
264
  list_group = doc.add_list_group(
315
265
  name=name,
316
266
  parent=self.parents[self.level],
@@ -320,64 +270,171 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
320
270
  self.ctx.list_ordered_flag_by_ref[list_group.self_ref] = is_ordered
321
271
  if is_ordered and start is not None:
322
272
  self.ctx.list_start_by_ref[list_group.self_ref] = start
323
-
324
273
  self.level += 1
325
274
 
326
- self.walk(element, doc)
275
+ # For each top-level <li> in this list
276
+ for li in tag.find_all({"li", "ul", "ol"}, recursive=False):
277
+ if not isinstance(li, Tag):
278
+ continue
279
+
280
+ # sub-list items should be indented under main list items, but temporarily
281
+ # addressing invalid HTML (docling-core/issues/357)
282
+ if li.name in {"ul", "ol"}:
283
+ self._handle_block(li, doc)
284
+
285
+ else:
286
+ # 1) determine the marker
287
+ if is_ordered and start is not None:
288
+ marker = f"{start + len(list_group.children)}."
289
+ else:
290
+ marker = ""
291
+
292
+ # 2) extract only the "direct" text from this <li>
293
+ parts: list[str] = []
294
+ for child in li.contents:
295
+ if isinstance(child, NavigableString) and not isinstance(
296
+ child, PreformattedString
297
+ ):
298
+ parts.append(child)
299
+ elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
300
+ text_part = child.get_text()
301
+ if text_part:
302
+ parts.append(text_part)
303
+ li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
304
+
305
+ # 3) add the list item
306
+ if li_text:
307
+ self.parents[self.level + 1] = doc.add_list_item(
308
+ text=li_text,
309
+ enumerated=is_ordered,
310
+ marker=marker,
311
+ parent=list_group,
312
+ content_layer=self.content_layer,
313
+ )
314
+
315
+ # 4) recurse into any nested lists, attaching them to this <li> item
316
+ for sublist in li({"ul", "ol"}, recursive=False):
317
+ if isinstance(sublist, Tag):
318
+ self.level += 1
319
+ self._handle_block(sublist, doc)
320
+ self.parents[self.level + 1] = None
321
+ self.level -= 1
322
+ else:
323
+ for sublist in li({"ul", "ol"}, recursive=False):
324
+ if isinstance(sublist, Tag):
325
+ self._handle_block(sublist, doc)
326
+
327
+ # 5) extract any images under this <li>
328
+ for img_tag in li("img"):
329
+ if isinstance(img_tag, Tag):
330
+ self._emit_image(img_tag, doc)
327
331
 
328
332
  self.parents[self.level + 1] = None
329
333
  self.level -= 1
330
334
 
331
- def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
332
- """Handles list item tags (li)."""
333
- nested_list = element.find(["ul", "ol"])
334
-
335
- parent = self.parents[self.level]
336
- if parent is None:
337
- _log.debug(f"list-item has no parent in DoclingDocument: {element}")
338
- return
339
- enumerated = self.ctx.list_ordered_flag_by_ref.get(parent.self_ref, False)
340
- if enumerated and (start := self.ctx.list_start_by_ref.get(parent.self_ref)):
341
- marker = f"{start + len(parent.children)}."
342
- else:
343
- marker = ""
344
-
345
- if nested_list:
346
- # Text in list item can be hidden within hierarchy, hence
347
- # we need to extract it recursively
348
- text: str = self.get_text(element)
349
- # Flatten text, remove break lines:
350
- text = text.replace("\n", "").replace("\r", "")
351
- text = " ".join(text.split()).strip()
352
-
353
- if len(text) > 0:
354
- # create a list-item
355
- self.parents[self.level + 1] = doc.add_list_item(
356
- text=text,
357
- enumerated=enumerated,
358
- marker=marker,
359
- parent=parent,
335
+ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> None:
336
+ tag_name = tag.name.lower()
337
+
338
+ if tag_name == "figure":
339
+ img_tag = tag.find("img")
340
+ if isinstance(img_tag, Tag):
341
+ self._emit_image(img_tag, doc)
342
+
343
+ elif tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
344
+ self._handle_heading(tag, doc)
345
+
346
+ elif tag_name in {"ul", "ol"}:
347
+ self._handle_list(tag, doc)
348
+
349
+ elif tag_name in {"p", "address", "summary"}:
350
+ for part in tag.text.split("\n"):
351
+ seg = part.strip()
352
+ if seg:
353
+ doc.add_text(
354
+ parent=self.parents[self.level],
355
+ label=DocItemLabel.TEXT,
356
+ text=seg,
357
+ content_layer=self.content_layer,
358
+ )
359
+ for img_tag in tag("img"):
360
+ if isinstance(img_tag, Tag):
361
+ self._emit_image(img_tag, doc)
362
+
363
+ elif tag_name == "table":
364
+ data = HTMLDocumentBackend.parse_table_data(tag)
365
+ for img_tag in tag("img"):
366
+ if isinstance(img_tag, Tag):
367
+ self._emit_image(tag, doc)
368
+ if data is not None:
369
+ doc.add_table(
370
+ data=data,
371
+ parent=self.parents[self.level],
360
372
  content_layer=self.content_layer,
361
373
  )
362
- self.level += 1
363
- self.walk(element, doc)
364
- self.parents[self.level + 1] = None
365
- self.level -= 1
366
- else:
367
- self.walk(element, doc)
368
374
 
369
- elif element.text.strip():
370
- text = element.text.strip()
375
+ elif tag_name in {"pre", "code"}:
376
+ # handle monospace code snippets (pre).
377
+ text = tag.get_text(strip=True)
378
+ if text:
379
+ doc.add_code(
380
+ parent=self.parents[self.level],
381
+ text=text,
382
+ content_layer=self.content_layer,
383
+ )
371
384
 
372
- doc.add_list_item(
373
- text=text,
374
- enumerated=enumerated,
375
- marker=marker,
376
- parent=parent,
385
+ elif tag_name == "details":
386
+ # handle details and its content.
387
+ self.parents[self.level + 1] = doc.add_group(
388
+ name="details",
389
+ label=GroupLabel.SECTION,
390
+ parent=self.parents[self.level],
377
391
  content_layer=self.content_layer,
378
392
  )
379
- else:
380
- _log.debug(f"list-item has no text: {element}")
393
+ self.level += 1
394
+ self._walk(tag, doc)
395
+ self.parents[self.level + 1] = None
396
+ self.level -= 1
397
+
398
+ def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
399
+ figure = img_tag.find_parent("figure")
400
+ caption: str = ""
401
+ if isinstance(figure, Tag):
402
+ caption_tag = figure.find("figcaption", recursive=False)
403
+ if isinstance(caption_tag, Tag):
404
+ caption = caption_tag.get_text()
405
+ if not caption:
406
+ caption = str(img_tag.get("alt", "")).strip()
407
+
408
+ caption_item: Optional[TextItem] = None
409
+ if caption:
410
+ caption_item = doc.add_text(
411
+ DocItemLabel.CAPTION, text=caption, content_layer=self.content_layer
412
+ )
413
+
414
+ doc.add_picture(
415
+ caption=caption_item,
416
+ parent=self.parents[self.level],
417
+ content_layer=self.content_layer,
418
+ )
419
+
420
+ @staticmethod
421
+ def _get_cell_spans(cell: Tag) -> tuple[int, int]:
422
+ """Extract colspan and rowspan values from a table cell tag.
423
+
424
+ This function retrieves the 'colspan' and 'rowspan' attributes from a given
425
+ table cell tag.
426
+ If the attribute does not exist or it is not numeric, it defaults to 1.
427
+ """
428
+ raw_spans: tuple[str, str] = (
429
+ str(cell.get("colspan", "1")),
430
+ str(cell.get("rowspan", "1")),
431
+ )
432
+ int_spans: tuple[int, int] = (
433
+ int(raw_spans[0]) if raw_spans[0].isnumeric() else 1,
434
+ int(raw_spans[1]) if raw_spans[0].isnumeric() else 1,
435
+ )
436
+
437
+ return int_spans
381
438
 
382
439
  @staticmethod
383
440
  def parse_table_data(element: Tag) -> Optional[TableData]: # noqa: C901
@@ -398,10 +455,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
398
455
  if not isinstance(row, Tag):
399
456
  continue
400
457
  cell_tag = cast(Tag, cell)
401
- val = cell_tag.get("colspan", "1")
402
- colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
403
- col_count += colspan
404
- if cell_tag.name == "td" or cell_tag.get("rowspan") is None:
458
+ col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell_tag)
459
+ col_count += col_span
460
+ if cell_tag.name == "td" or row_span == 1:
405
461
  is_row_header = False
406
462
  num_cols = max(num_cols, col_count)
407
463
  if not is_row_header:
@@ -428,10 +484,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
428
484
  row_header = True
429
485
  for html_cell in cells:
430
486
  if isinstance(html_cell, Tag):
487
+ _, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
431
488
  if html_cell.name == "td":
432
489
  col_header = False
433
490
  row_header = False
434
- elif html_cell.get("rowspan") is None:
491
+ elif row_span == 1:
435
492
  row_header = False
436
493
  if not row_header:
437
494
  row_idx += 1
@@ -456,18 +513,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
456
513
  text = html_cell.text
457
514
 
458
515
  # label = html_cell.name
459
- col_val = html_cell.get("colspan", "1")
460
- col_span = (
461
- int(col_val)
462
- if isinstance(col_val, str) and col_val.isnumeric()
463
- else 1
464
- )
465
- row_val = html_cell.get("rowspan", "1")
466
- row_span = (
467
- int(row_val)
468
- if isinstance(row_val, str) and row_val.isnumeric()
469
- else 1
470
- )
516
+ col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
471
517
  if row_header:
472
518
  row_span -= 1
473
519
  while (
@@ -494,84 +540,3 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
494
540
  data.table_cells.append(table_cell)
495
541
 
496
542
  return data
497
-
498
- def handle_table(self, element: Tag, doc: DoclingDocument) -> None:
499
- """Handles table tags."""
500
-
501
- table_data = HTMLDocumentBackend.parse_table_data(element)
502
-
503
- if table_data is not None:
504
- doc.add_table(
505
- data=table_data,
506
- parent=self.parents[self.level],
507
- content_layer=self.content_layer,
508
- )
509
-
510
- def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
511
- """Recursively extract text from <ul> or <ol> with proper indentation."""
512
- result = []
513
- bullet_char = "*" # Default bullet character for unordered lists
514
-
515
- if list_element.name == "ol": # For ordered lists, use numbers
516
- for i, li in enumerate(list_element("li", recursive=False), 1):
517
- if not isinstance(li, Tag):
518
- continue
519
- # Add numbering for ordered lists
520
- result.append(f"{' ' * level}{i}. {li.get_text(strip=True)}")
521
- # Handle nested lists
522
- nested_list = li.find(["ul", "ol"])
523
- if isinstance(nested_list, Tag):
524
- result.extend(self.get_list_text(nested_list, level + 1))
525
- elif list_element.name == "ul": # For unordered lists, use bullet points
526
- for li in list_element("li", recursive=False):
527
- if not isinstance(li, Tag):
528
- continue
529
- # Add bullet points for unordered lists
530
- result.append(
531
- f"{' ' * level}{bullet_char} {li.get_text(strip=True)}"
532
- )
533
- # Handle nested lists
534
- nested_list = li.find(["ul", "ol"])
535
- if isinstance(nested_list, Tag):
536
- result.extend(self.get_list_text(nested_list, level + 1))
537
-
538
- return result
539
-
540
- def handle_figure(self, element: Tag, doc: DoclingDocument) -> None:
541
- """Handles image tags (img)."""
542
-
543
- # Extract the image URI from the <img> tag
544
- # image_uri = root.xpath('//figure//img/@src')[0]
545
-
546
- contains_captions = element.find(["figcaption"])
547
- if not isinstance(contains_captions, Tag):
548
- doc.add_picture(
549
- parent=self.parents[self.level],
550
- caption=None,
551
- content_layer=self.content_layer,
552
- )
553
- else:
554
- texts = []
555
- for item in contains_captions:
556
- texts.append(item.text)
557
-
558
- fig_caption = doc.add_text(
559
- label=DocItemLabel.CAPTION,
560
- text=("".join(texts)).strip(),
561
- content_layer=self.content_layer,
562
- )
563
- doc.add_picture(
564
- parent=self.parents[self.level],
565
- caption=fig_caption,
566
- content_layer=self.content_layer,
567
- )
568
-
569
- def handle_image(self, element: Tag, doc: DoclingDocument) -> None:
570
- """Handles image tags (img)."""
571
- _log.debug(f"ignoring <img> tags at the moment: {element}")
572
-
573
- doc.add_picture(
574
- parent=self.parents[self.level],
575
- caption=None,
576
- content_layer=self.content_layer,
577
- )
@@ -93,8 +93,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
93
93
 
94
94
  # Initialize the root of the document hierarchy
95
95
  self.root: Optional[NodeItem] = None
96
-
97
- self.valid = False
96
+ self.hlevel: int = 0
97
+ self.valid: bool = False
98
98
  try:
99
99
  if isinstance(self.path_or_stream, BytesIO):
100
100
  self.path_or_stream.seek(0)
@@ -147,6 +147,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
147
147
  binary_hash=self.document_hash,
148
148
  )
149
149
  doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
150
+ self.hlevel = 0
150
151
 
151
152
  # Get metadata XML components
152
153
  xml_components: XMLComponents = self._parse_metadata()
@@ -304,7 +305,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
304
305
  title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
305
306
  if not text:
306
307
  continue
307
- parent = doc.add_heading(parent=self.root, text=title)
308
+ parent = doc.add_heading(
309
+ parent=self.root, text=title, level=self.hlevel + 1
310
+ )
308
311
  doc.add_text(
309
312
  parent=parent,
310
313
  text=text,
@@ -637,7 +640,10 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
637
640
  elif child.tag == "ack":
638
641
  text = DEFAULT_HEADER_ACKNOWLEDGMENTS
639
642
  if text:
640
- new_parent = doc.add_heading(text=text, parent=parent)
643
+ self.hlevel += 1
644
+ new_parent = doc.add_heading(
645
+ text=text, parent=parent, level=self.hlevel
646
+ )
641
647
  elif child.tag == "list":
642
648
  new_parent = doc.add_group(
643
649
  label=GroupLabel.LIST, name="list", parent=parent
@@ -694,6 +700,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
694
700
  new_text = self._walk_linear(doc, new_parent, child)
695
701
  if not (node.getparent().tag == "p" and node.tag in flush_tags):
696
702
  node_text += new_text
703
+ if child.tag in ("sec", "ack") and text:
704
+ self.hlevel -= 1
697
705
 
698
706
  # pick up the tail text
699
707
  node_text += child.tail.replace("\n", " ") if child.tail else ""
@@ -217,7 +217,7 @@ smolvlm_picture_description = PictureDescriptionVlmOptions(
217
217
 
218
218
  # GraniteVision
219
219
  granite_picture_description = PictureDescriptionVlmOptions(
220
- repo_id="ibm-granite/granite-vision-3.2-2b-preview",
220
+ repo_id="ibm-granite/granite-vision-3.3-2b",
221
221
  prompt="What is shown in this image?",
222
222
  )
223
223
 
@@ -279,6 +279,9 @@ class LayoutOptions(BaseModel):
279
279
  """Options for layout processing."""
280
280
 
281
281
  create_orphan_clusters: bool = True # Whether to create clusters for orphaned cells
282
+ keep_empty_clusters: bool = (
283
+ False # Whether to keep clusters that contain no text cells
284
+ )
282
285
  model_spec: LayoutModelConfig = DOCLING_LAYOUT_V2
283
286
 
284
287
 
@@ -1,6 +1,7 @@
1
1
  import hashlib
2
2
  import logging
3
3
  import sys
4
+ import threading
4
5
  import time
5
6
  from collections.abc import Iterable, Iterator
6
7
  from functools import partial
@@ -49,6 +50,7 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
49
50
  from docling.utils.utils import chunkify
50
51
 
51
52
  _log = logging.getLogger(__name__)
53
+ _PIPELINE_CACHE_LOCK = threading.Lock()
52
54
 
53
55
 
54
56
  class FormatOption(BaseModel):
@@ -315,17 +317,18 @@ class DocumentConverter:
315
317
  # Use a composite key to cache pipelines
316
318
  cache_key = (pipeline_class, options_hash)
317
319
 
318
- if cache_key not in self.initialized_pipelines:
319
- _log.info(
320
- f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
321
- )
322
- self.initialized_pipelines[cache_key] = pipeline_class(
323
- pipeline_options=pipeline_options
324
- )
325
- else:
326
- _log.debug(
327
- f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
328
- )
320
+ with _PIPELINE_CACHE_LOCK:
321
+ if cache_key not in self.initialized_pipelines:
322
+ _log.info(
323
+ f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
324
+ )
325
+ self.initialized_pipelines[cache_key] = pipeline_class(
326
+ pipeline_options=pipeline_options
327
+ )
328
+ else:
329
+ _log.debug(
330
+ f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
331
+ )
329
332
 
330
333
  return self.initialized_pipelines[cache_key]
331
334
 
@@ -65,6 +65,7 @@ class PictureDescriptionVlmModel(
65
65
  self.processor = AutoProcessor.from_pretrained(artifacts_path)
66
66
  self.model = AutoModelForVision2Seq.from_pretrained(
67
67
  artifacts_path,
68
+ device_map=self.device,
68
69
  torch_dtype=torch.bfloat16,
69
70
  _attn_implementation=(
70
71
  "flash_attention_2"
@@ -72,7 +73,7 @@ class PictureDescriptionVlmModel(
72
73
  and accelerator_options.cuda_use_flash_attention2
73
74
  else "eager"
74
75
  ),
75
- ).to(self.device)
76
+ )
76
77
 
77
78
  self.provenance = f"{self.options.repo_id}"
78
79
 
@@ -267,8 +267,14 @@ class LayoutPostprocessor:
267
267
  # Initial cell assignment
268
268
  clusters = self._assign_cells_to_clusters(clusters)
269
269
 
270
- # Remove clusters with no cells
271
- clusters = [cluster for cluster in clusters if cluster.cells]
270
+ # Remove clusters with no cells (if keep_empty_clusters is False),
271
+ # but always keep clusters with label DocItemLabel.FORMULA
272
+ if not self.options.keep_empty_clusters:
273
+ clusters = [
274
+ cluster
275
+ for cluster in clusters
276
+ if cluster.cells or cluster.label == DocItemLabel.FORMULA
277
+ ]
272
278
 
273
279
  # Handle orphaned cells
274
280
  unassigned = self._find_unassigned_cells(clusters)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.41.0
3
+ Version: 2.42.1
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -50,6 +50,7 @@ Requires-Dist: tqdm<5.0.0,>=4.65.0
50
50
  Requires-Dist: pluggy<2.0.0,>=1.0.0
51
51
  Requires-Dist: pylatexenc<3.0,>=2.10
52
52
  Requires-Dist: scipy<2.0.0,>=1.6.0
53
+ Requires-Dist: accelerate<2,>=1.0.0
53
54
  Provides-Extra: tesserocr
54
55
  Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
55
56
  Provides-Extra: ocrmac
@@ -1,5 +1,5 @@
1
1
  docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- docling/document_converter.py,sha256=3jWywP_TLy-1PMvjJBUlnTM9FNzpBLRCHYA1RKFvGR4,14333
2
+ docling/document_converter.py,sha256=9aH8B30_jOYN4P_ySCCvtgEb3GoIpec15r7lEAFlMDU,14469
3
3
  docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
4
4
  docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
5
5
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -9,7 +9,7 @@ docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE
9
9
  docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
10
10
  docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
11
11
  docling/backend/docling_parse_v4_backend.py,sha256=qR_WRVq9JGtRioWCw6MnLWgbvXbC6Y1yds7Ol1-E6UQ,6550
12
- docling/backend/html_backend.py,sha256=Z959dzqYQO2pPE4xgPRxC5MR9j3nFGtiD6_F_osQ2iI,20670
12
+ docling/backend/html_backend.py,sha256=gGkm3i7FpW2WCJ-_GPpOJNh1LUq1_-vRGyGURuPagck,19284
13
13
  docling/backend/md_backend.py,sha256=mfwGj8g2hGC-Q_HREtl_Web65uMVXD-Ie1nRqWTXzF0,21013
14
14
  docling/backend/msexcel_backend.py,sha256=cq8MQ2RSh6pqCiVrldjOerSww7dOPTWmCQoCBI57i6w,18579
15
15
  docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
@@ -20,11 +20,11 @@ docling/backend/pypdfium2_backend.py,sha256=8dVniLHgiTdJuDbYr66kPp6Ccv5ZDlqDMEbA
20
20
  docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
21
  docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  docling/backend/docx/latex/latex_dict.py,sha256=tFJp4ScT_AkY2ON7nLEa560p601Jq2glcZvMKxxjn7w,6593
23
- docling/backend/docx/latex/omml.py,sha256=nEpcfyyrOucJyj6cD7wfThrIa-q0CQCoqMb3dkrhCRg,12094
23
+ docling/backend/docx/latex/omml.py,sha256=4vh9FCbXh-Tb6KJGqNwzlMUMYEnnJgBtBI24dwy6t2U,12416
24
24
  docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
25
  docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
26
26
  docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
- docling/backend/xml/jats_backend.py,sha256=ghGi9bHjx3BvaOtmzLw86-wZy4UxpQPOPQL4e73-BI8,24927
27
+ docling/backend/xml/jats_backend.py,sha256=LPj33EFdi2MRCakkLWrRLlUAc-B-949f8zp5gKNvBcg,25238
28
28
  docling/backend/xml/uspto_backend.py,sha256=nyAMr5ht7dclxkVDwsKNeiOhLQrUtRLS8JdscB2AVJg,70924
29
29
  docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
30
30
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -37,7 +37,7 @@ docling/datamodel/asr_model_specs.py,sha256=Wg7z3zm_wXIWu122iPVy0RMECsA_JCFHrlFF
37
37
  docling/datamodel/base_models.py,sha256=9FslHkGUNmBp264LpLL_2JTfDAdaikldYs3SiQOHb5A,11828
38
38
  docling/datamodel/document.py,sha256=CA_dgt4V_phze5HXpfgfKNBKd1cPC1o3WE_IENX63EM,16252
39
39
  docling/datamodel/layout_model_specs.py,sha256=GSkJ-Z_0PVgwWGi7C7TsxbzRjlrWS9ZrHJjHumv-Z5U,2339
40
- docling/datamodel/pipeline_options.py,sha256=aMwpbyEMbAC-xGJnjQp8iw2ocpSU4eiD8D73gHf7T4U,10033
40
+ docling/datamodel/pipeline_options.py,sha256=nlejeQjnJx2RBMkCukDECHGuVEOol9hbsSLUi2ee9hY,10134
41
41
  docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
42
42
  docling/datamodel/pipeline_options_vlm_model.py,sha256=z-pUqwRA8nJp6C3SEXZLem2zvSYdgavaAVYa8wkAIZY,2400
43
43
  docling/datamodel/settings.py,sha256=ajMz7Ao2m0ZGYkfArqTDDbiF89O408mtgeh06PUi0MA,1900
@@ -55,7 +55,7 @@ docling/models/page_assemble_model.py,sha256=TvN1naez7dUodLxpUUBzpuMCpqZBTf6YSpe
55
55
  docling/models/page_preprocessing_model.py,sha256=x8MI4mvjizqEqAb5511dtrNRCJSb-lSmwHw0tmHPFiI,5103
56
56
  docling/models/picture_description_api_model.py,sha256=o3EkV5aHW_6WzE_fdj_VRnNCrS_btclO_ZCLAUqrfl0,2377
57
57
  docling/models/picture_description_base_model.py,sha256=kLthLhdlgwhootQ4_xhhcAk6A-vso5-qcsFJ3TcYfO0,2991
58
- docling/models/picture_description_vlm_model.py,sha256=nAUt-eZOX2GvaCiV2BJO7VppxUbP7udVIF4oe_sEYXo,4000
58
+ docling/models/picture_description_vlm_model.py,sha256=yfyAFOy8RjxQJrafPMSAMrrpaYu3anahjRX6tCnVcs0,4028
59
59
  docling/models/rapid_ocr_model.py,sha256=AMdc66s_iWO4p6nQ0LNjQMUYVxrDSxMyLNPpjPYt6N8,5916
60
60
  docling/models/readingorder_model.py,sha256=bZoXHaSwUsa8niSmJrbCuy784ixCeBXT-RQBUfgHJ4A,14925
61
61
  docling/models/table_structure_model.py,sha256=RFXo73f2q4XuKyaSqbxpznh7JVtlLcT0FsOWl9oZbSg,12518
@@ -83,7 +83,7 @@ docling/utils/accelerator_utils.py,sha256=DSajLxVx1JEVT0zt5de26llciLNlVfIDfSa2zY
83
83
  docling/utils/api_image_request.py,sha256=_CgdzmPqdsyXmyYUFGLZcXcoH586qC6A1p5vsNbj1Q0,1416
84
84
  docling/utils/export.py,sha256=VwVUnYDk3mhGmISDbVm306fwpGNnoojouStBD4UajXI,4673
85
85
  docling/utils/glm_utils.py,sha256=TKOWQqWAHsX_w4fvoAA7_2xCi_urhnp1DsmjY8_sk5w,12274
86
- docling/utils/layout_postprocessor.py,sha256=QuTZZq4LNs1eM_n_2gubVfAuLBMkJiozfs3hp-jUpK4,24399
86
+ docling/utils/layout_postprocessor.py,sha256=LFLbBE-o3kWu79d8ZcyHlZPIqzQfCabZCIPTJ51lZsY,24657
87
87
  docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
88
88
  docling/utils/model_downloader.py,sha256=3vijCsAIVwWqehGBDRxRq7mJ3yRb9-zBsG00iqjqegU,4076
89
89
  docling/utils/ocr_utils.py,sha256=nmresYyfin0raanpQc_GGeU3WoLsfExf6SEXNIQ7Djg,2325
@@ -91,9 +91,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
91
91
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
92
92
  docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
93
93
  docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
94
- docling-2.41.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
95
- docling-2.41.0.dist-info/METADATA,sha256=KYqB0miKX2x2ESNy8tNHdAlyTCONqhwGLR2iag2PcQ0,10274
96
- docling-2.41.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
97
- docling-2.41.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
98
- docling-2.41.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
99
- docling-2.41.0.dist-info/RECORD,,
94
+ docling-2.42.1.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
95
+ docling-2.42.1.dist-info/METADATA,sha256=d46NOPDEps6dVLLMh3tWBCEQv7b_bwQQ46ndyqVO-ag,10310
96
+ docling-2.42.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
97
+ docling-2.42.1.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
98
+ docling-2.42.1.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
99
+ docling-2.42.1.dist-info/RECORD,,