docling 2.38.0__py3-none-any.whl → 2.38.1__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
@@ -2,9 +2,10 @@ import logging
2
2
  import re
3
3
  import warnings
4
4
  from copy import deepcopy
5
+ from enum import Enum
5
6
  from io import BytesIO
6
7
  from pathlib import Path
7
- from typing import List, Optional, Set, Union
8
+ from typing import List, Literal, Optional, Set, Union
8
9
 
9
10
  import marko
10
11
  import marko.element
@@ -21,7 +22,8 @@ from docling_core.types.doc import (
21
22
  )
22
23
  from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
23
24
  from marko import Markdown
24
- from pydantic import AnyUrl, TypeAdapter
25
+ from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
26
+ from typing_extensions import Annotated
25
27
 
26
28
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
27
29
  from docling.backend.html_backend import HTMLDocumentBackend
@@ -35,6 +37,31 @@ _START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
35
37
  _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
36
38
 
37
39
 
40
+ class _PendingCreationType(str, Enum):
41
+ """CoordOrigin."""
42
+
43
+ HEADING = "heading"
44
+ LIST_ITEM = "list_item"
45
+
46
+
47
+ class _HeadingCreationPayload(BaseModel):
48
+ kind: Literal["heading"] = "heading"
49
+ level: int
50
+
51
+
52
+ class _ListItemCreationPayload(BaseModel):
53
+ kind: Literal["list_item"] = "list_item"
54
+
55
+
56
+ _CreationPayload = Annotated[
57
+ Union[
58
+ _HeadingCreationPayload,
59
+ _ListItemCreationPayload,
60
+ ],
61
+ Field(discriminator="kind"),
62
+ ]
63
+
64
+
38
65
  class MarkdownDocumentBackend(DeclarativeDocumentBackend):
39
66
  def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
40
67
  # This regex will match any sequence of underscores
@@ -155,6 +182,52 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
155
182
  doc.add_table(data=table_data)
156
183
  return
157
184
 
185
+ def _create_list_item(
186
+ self,
187
+ doc: DoclingDocument,
188
+ parent_item: Optional[NodeItem],
189
+ text: str,
190
+ formatting: Optional[Formatting] = None,
191
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
192
+ ):
193
+ if not isinstance(parent_item, (OrderedList, UnorderedList)):
194
+ _log.warning("ListItem would have not had a list parent, adding one.")
195
+ parent_item = doc.add_unordered_list(parent=parent_item)
196
+ item = doc.add_list_item(
197
+ text=text,
198
+ enumerated=(isinstance(parent_item, OrderedList)),
199
+ parent=parent_item,
200
+ formatting=formatting,
201
+ hyperlink=hyperlink,
202
+ )
203
+ return item
204
+
205
+ def _create_heading_item(
206
+ self,
207
+ doc: DoclingDocument,
208
+ parent_item: Optional[NodeItem],
209
+ text: str,
210
+ level: int,
211
+ formatting: Optional[Formatting] = None,
212
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
213
+ ):
214
+ if level == 1:
215
+ item = doc.add_title(
216
+ text=text,
217
+ parent=parent_item,
218
+ formatting=formatting,
219
+ hyperlink=hyperlink,
220
+ )
221
+ else:
222
+ item = doc.add_heading(
223
+ text=text,
224
+ level=level - 1,
225
+ parent=parent_item,
226
+ formatting=formatting,
227
+ hyperlink=hyperlink,
228
+ )
229
+ return item
230
+
158
231
  def _iterate_elements( # noqa: C901
159
232
  self,
160
233
  *,
@@ -162,6 +235,9 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
162
235
  depth: int,
163
236
  doc: DoclingDocument,
164
237
  visited: Set[marko.element.Element],
238
+ creation_stack: list[
239
+ _CreationPayload
240
+ ], # stack for lazy item creation triggered deep in marko's AST (on RawText)
165
241
  parent_item: Optional[NodeItem] = None,
166
242
  formatting: Optional[Formatting] = None,
167
243
  hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -177,28 +253,17 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
177
253
  f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
178
254
  )
179
255
 
180
- if len(element.children) == 1:
181
- child = element.children[0]
182
- snippet_text = str(child.children) # type: ignore
183
- visited.add(child)
184
- else:
185
- snippet_text = "" # inline group will be created
186
-
187
- if element.level == 1:
188
- parent_item = doc.add_title(
189
- text=snippet_text,
190
- parent=parent_item,
256
+ if len(element.children) > 1: # inline group will be created further down
257
+ parent_item = self._create_heading_item(
258
+ doc=doc,
259
+ parent_item=parent_item,
260
+ text="",
261
+ level=element.level,
191
262
  formatting=formatting,
192
263
  hyperlink=hyperlink,
193
264
  )
194
265
  else:
195
- parent_item = doc.add_heading(
196
- text=snippet_text,
197
- level=element.level - 1,
198
- parent=parent_item,
199
- formatting=formatting,
200
- hyperlink=hyperlink,
201
- )
266
+ creation_stack.append(_HeadingCreationPayload(level=element.level))
202
267
 
203
268
  elif isinstance(element, marko.block.List):
204
269
  has_non_empty_list_items = False
@@ -224,22 +289,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
224
289
  self._close_table(doc)
225
290
  _log.debug(" - List item")
226
291
 
227
- if len(child.children) == 1:
228
- snippet_text = str(child.children[0].children) # type: ignore
229
- visited.add(child)
292
+ if len(child.children) > 1: # inline group will be created further down
293
+ parent_item = self._create_list_item(
294
+ doc=doc,
295
+ parent_item=parent_item,
296
+ text="",
297
+ formatting=formatting,
298
+ hyperlink=hyperlink,
299
+ )
230
300
  else:
231
- snippet_text = "" # inline group will be created
232
- is_numbered = isinstance(parent_item, OrderedList)
233
- if not isinstance(parent_item, (OrderedList, UnorderedList)):
234
- _log.warning("ListItem would have not had a list parent, adding one.")
235
- parent_item = doc.add_unordered_list(parent=parent_item)
236
- parent_item = doc.add_list_item(
237
- enumerated=is_numbered,
238
- parent=parent_item,
239
- text=snippet_text,
240
- formatting=formatting,
241
- hyperlink=hyperlink,
242
- )
301
+ creation_stack.append(_ListItemCreationPayload())
243
302
 
244
303
  elif isinstance(element, marko.inline.Image):
245
304
  self._close_table(doc)
@@ -285,13 +344,38 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
285
344
  self.md_table_buffer.append(snippet_text)
286
345
  elif snippet_text:
287
346
  self._close_table(doc)
288
- doc.add_text(
289
- label=DocItemLabel.TEXT,
290
- parent=parent_item,
291
- text=snippet_text,
292
- formatting=formatting,
293
- hyperlink=hyperlink,
294
- )
347
+
348
+ if creation_stack:
349
+ while len(creation_stack) > 0:
350
+ to_create = creation_stack.pop()
351
+ if isinstance(to_create, _ListItemCreationPayload):
352
+ parent_item = self._create_list_item(
353
+ doc=doc,
354
+ parent_item=parent_item,
355
+ text=snippet_text,
356
+ formatting=formatting,
357
+ hyperlink=hyperlink,
358
+ )
359
+ elif isinstance(to_create, _HeadingCreationPayload):
360
+ # not keeping as parent_item as logic for correctly tracking
361
+ # that not implemented yet (section components not captured
362
+ # as heading children in marko)
363
+ self._create_heading_item(
364
+ doc=doc,
365
+ parent_item=parent_item,
366
+ text=snippet_text,
367
+ level=to_create.level,
368
+ formatting=formatting,
369
+ hyperlink=hyperlink,
370
+ )
371
+ else:
372
+ doc.add_text(
373
+ label=DocItemLabel.TEXT,
374
+ parent=parent_item,
375
+ text=snippet_text,
376
+ formatting=formatting,
377
+ hyperlink=hyperlink,
378
+ )
295
379
 
296
380
  elif isinstance(element, marko.inline.CodeSpan):
297
381
  self._close_table(doc)
@@ -353,7 +437,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
353
437
  parent_item = doc.add_inline_group(parent=parent_item)
354
438
 
355
439
  processed_block_types = (
356
- # marko.block.Heading,
357
440
  marko.block.CodeBlock,
358
441
  marko.block.FencedCode,
359
442
  marko.inline.RawText,
@@ -369,6 +452,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
369
452
  depth=depth + 1,
370
453
  doc=doc,
371
454
  visited=visited,
455
+ creation_stack=creation_stack,
372
456
  parent_item=parent_item,
373
457
  formatting=formatting,
374
458
  hyperlink=hyperlink,
@@ -412,6 +496,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
412
496
  doc=doc,
413
497
  parent_item=None,
414
498
  visited=set(),
499
+ creation_stack=[],
415
500
  )
416
501
  self._close_table(doc=doc) # handle any last hanging table
417
502
 
@@ -397,7 +397,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
397
397
  if isinstance(c, Hyperlink):
398
398
  text = c.text
399
399
  hyperlink = Path(c.address)
400
- format = self._get_format_from_run(c.runs[0])
400
+ format = (
401
+ self._get_format_from_run(c.runs[0])
402
+ if c.runs and len(c.runs) > 0
403
+ else None
404
+ )
401
405
  elif isinstance(c, Run):
402
406
  text = c.text
403
407
  hyperlink = None
@@ -301,7 +301,7 @@ class OpenAiChatMessage(BaseModel):
301
301
  class OpenAiResponseChoice(BaseModel):
302
302
  index: int
303
303
  message: OpenAiChatMessage
304
- finish_reason: str
304
+ finish_reason: Optional[str]
305
305
 
306
306
 
307
307
  class OpenAiResponseUsage(BaseModel):
@@ -207,7 +207,7 @@ smolvlm_picture_description = PictureDescriptionVlmOptions(
207
207
 
208
208
  # GraniteVision
209
209
  granite_picture_description = PictureDescriptionVlmOptions(
210
- repo_id="ibm-granite/granite-vision-3.1-2b-preview",
210
+ repo_id="ibm-granite/granite-vision-3.2-2b-preview",
211
211
  prompt="What is shown in this image?",
212
212
  )
213
213
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.38.0
3
+ Version: 2.38.1
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -10,10 +10,10 @@ docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3U
10
10
  docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
11
11
  docling/backend/docling_parse_v4_backend.py,sha256=7tQvpCwpYoq98PNszDkrXaFhy5eWmQqMP4RjWWPLPgw,6197
12
12
  docling/backend/html_backend.py,sha256=3K-l5SUAAyqISNEb7nPst_I51xzYOVOkgmwXh3lv9sw,21063
13
- docling/backend/md_backend.py,sha256=ghIU_NSaENKrRu49Dn5GvjYtcAgEU7ZHbf-TeYg49nY,17673
13
+ docling/backend/md_backend.py,sha256=kSQ7dn_IrAmt53kL_0Z5LnpE2fWif9RkBAGtqzgfQaM,20514
14
14
  docling/backend/msexcel_backend.py,sha256=3j0WQfqDpgPXdPMCguefdv7arcNVDedPD6gl54cmLn8,18110
15
15
  docling/backend/mspowerpoint_backend.py,sha256=0lsb8ZeQFxbDt7jZpSQyk5wYHYa3SP2T2y2dMI-o30o,15216
16
- docling/backend/msword_backend.py,sha256=C4qs4mQEt1JzonCg5v6_yUxdngzcTzSO9k1ik8_DW5Q,44855
16
+ docling/backend/msword_backend.py,sha256=xj009k1s7uzmNx3yGZZelsSgxa6ylaJ1yYHxYfHVLOo,44975
17
17
  docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
18
18
  docling/backend/pdf_backend.py,sha256=KE9TMuFO5WX-o5A_DAd4tEaLi4HMZ4XjKdpllItVkWM,2238
19
19
  docling/backend/pypdfium2_backend.py,sha256=8dVniLHgiTdJuDbYr66kPp6Ccv5ZDlqDMEbA2xIfS7U,13370
@@ -34,9 +34,9 @@ docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
34
34
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
35
  docling/datamodel/accelerator_options.py,sha256=wv6dOFTVAwr9onkE-0pfUqX_fDb6gX53iPPE6o8nKjI,2511
36
36
  docling/datamodel/asr_model_specs.py,sha256=L7ETXsUKVbPsVcPLhEIMxQjd4UzMGZBVsy74CLsZBkU,2181
37
- docling/datamodel/base_models.py,sha256=L35qXLmADZQNEzBC0M6K2xrfLyqrTqDlbPD6E6DkWMc,11146
37
+ docling/datamodel/base_models.py,sha256=67o1ptOTT8tW7i-g6gM2JKEX_1CDbmKEMQ_B9ZYM2z0,11156
38
38
  docling/datamodel/document.py,sha256=CA_dgt4V_phze5HXpfgfKNBKd1cPC1o3WE_IENX63EM,16252
39
- docling/datamodel/pipeline_options.py,sha256=N7my7hmvuX6EzlujHeF6RObPSrG_HjN_nfPzILTqP-E,9479
39
+ docling/datamodel/pipeline_options.py,sha256=7mKv1IThXYpu3osggp_Y2h7E5C8nbxJLQXS7JJPMvYQ,9479
40
40
  docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
41
41
  docling/datamodel/pipeline_options_vlm_model.py,sha256=rtDMVtKFZbgQD269w8FvHMXEhdRBrsA4rVYk6A-M-b4,2063
42
42
  docling/datamodel/settings.py,sha256=ajMz7Ao2m0ZGYkfArqTDDbiF89O408mtgeh06PUi0MA,1900
@@ -90,9 +90,9 @@ docling/utils/orientation.py,sha256=xXlOfowL54FKwjsTFrM7y3ogk1wChLNn_-u74tYIf1s,
90
90
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
91
91
  docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
92
92
  docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
93
- docling-2.38.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
94
- docling-2.38.0.dist-info/METADATA,sha256=vT8Zko4wD8iyKUjLAJ83Cm7ntscjEk5ojHvcJXlvT5A,10273
95
- docling-2.38.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
96
- docling-2.38.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
97
- docling-2.38.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
98
- docling-2.38.0.dist-info/RECORD,,
93
+ docling-2.38.1.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
94
+ docling-2.38.1.dist-info/METADATA,sha256=14E9MwQXlyuB4nWa31ZTjW6vvv5p2eCs2xxVTE4-qT4,10273
95
+ docling-2.38.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
96
+ docling-2.38.1.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
97
+ docling-2.38.1.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
98
+ docling-2.38.1.dist-info/RECORD,,