docling 2.38.0__py3-none-any.whl → 2.38.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/md_backend.py +128 -43
- docling/backend/msword_backend.py +5 -1
- docling/datamodel/base_models.py +1 -1
- docling/datamodel/pipeline_options.py +1 -1
- {docling-2.38.0.dist-info → docling-2.38.1.dist-info}/METADATA +1 -1
- {docling-2.38.0.dist-info → docling-2.38.1.dist-info}/RECORD +10 -10
- {docling-2.38.0.dist-info → docling-2.38.1.dist-info}/WHEEL +0 -0
- {docling-2.38.0.dist-info → docling-2.38.1.dist-info}/entry_points.txt +0 -0
- {docling-2.38.0.dist-info → docling-2.38.1.dist-info}/licenses/LICENSE +0 -0
- {docling-2.38.0.dist-info → docling-2.38.1.dist-info}/top_level.txt +0 -0
docling/backend/md_backend.py
CHANGED
@@ -2,9 +2,10 @@ import logging
|
|
2
2
|
import re
|
3
3
|
import warnings
|
4
4
|
from copy import deepcopy
|
5
|
+
from enum import Enum
|
5
6
|
from io import BytesIO
|
6
7
|
from pathlib import Path
|
7
|
-
from typing import List, Optional, Set, Union
|
8
|
+
from typing import List, Literal, Optional, Set, Union
|
8
9
|
|
9
10
|
import marko
|
10
11
|
import marko.element
|
@@ -21,7 +22,8 @@ from docling_core.types.doc import (
|
|
21
22
|
)
|
22
23
|
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
|
23
24
|
from marko import Markdown
|
24
|
-
from pydantic import AnyUrl, TypeAdapter
|
25
|
+
from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
|
26
|
+
from typing_extensions import Annotated
|
25
27
|
|
26
28
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
27
29
|
from docling.backend.html_backend import HTMLDocumentBackend
|
@@ -35,6 +37,31 @@ _START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
|
|
35
37
|
_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
|
36
38
|
|
37
39
|
|
40
|
+
class _PendingCreationType(str, Enum):
|
41
|
+
"""CoordOrigin."""
|
42
|
+
|
43
|
+
HEADING = "heading"
|
44
|
+
LIST_ITEM = "list_item"
|
45
|
+
|
46
|
+
|
47
|
+
class _HeadingCreationPayload(BaseModel):
|
48
|
+
kind: Literal["heading"] = "heading"
|
49
|
+
level: int
|
50
|
+
|
51
|
+
|
52
|
+
class _ListItemCreationPayload(BaseModel):
|
53
|
+
kind: Literal["list_item"] = "list_item"
|
54
|
+
|
55
|
+
|
56
|
+
_CreationPayload = Annotated[
|
57
|
+
Union[
|
58
|
+
_HeadingCreationPayload,
|
59
|
+
_ListItemCreationPayload,
|
60
|
+
],
|
61
|
+
Field(discriminator="kind"),
|
62
|
+
]
|
63
|
+
|
64
|
+
|
38
65
|
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
39
66
|
def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
40
67
|
# This regex will match any sequence of underscores
|
@@ -155,6 +182,52 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
155
182
|
doc.add_table(data=table_data)
|
156
183
|
return
|
157
184
|
|
185
|
+
def _create_list_item(
|
186
|
+
self,
|
187
|
+
doc: DoclingDocument,
|
188
|
+
parent_item: Optional[NodeItem],
|
189
|
+
text: str,
|
190
|
+
formatting: Optional[Formatting] = None,
|
191
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
192
|
+
):
|
193
|
+
if not isinstance(parent_item, (OrderedList, UnorderedList)):
|
194
|
+
_log.warning("ListItem would have not had a list parent, adding one.")
|
195
|
+
parent_item = doc.add_unordered_list(parent=parent_item)
|
196
|
+
item = doc.add_list_item(
|
197
|
+
text=text,
|
198
|
+
enumerated=(isinstance(parent_item, OrderedList)),
|
199
|
+
parent=parent_item,
|
200
|
+
formatting=formatting,
|
201
|
+
hyperlink=hyperlink,
|
202
|
+
)
|
203
|
+
return item
|
204
|
+
|
205
|
+
def _create_heading_item(
|
206
|
+
self,
|
207
|
+
doc: DoclingDocument,
|
208
|
+
parent_item: Optional[NodeItem],
|
209
|
+
text: str,
|
210
|
+
level: int,
|
211
|
+
formatting: Optional[Formatting] = None,
|
212
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
213
|
+
):
|
214
|
+
if level == 1:
|
215
|
+
item = doc.add_title(
|
216
|
+
text=text,
|
217
|
+
parent=parent_item,
|
218
|
+
formatting=formatting,
|
219
|
+
hyperlink=hyperlink,
|
220
|
+
)
|
221
|
+
else:
|
222
|
+
item = doc.add_heading(
|
223
|
+
text=text,
|
224
|
+
level=level - 1,
|
225
|
+
parent=parent_item,
|
226
|
+
formatting=formatting,
|
227
|
+
hyperlink=hyperlink,
|
228
|
+
)
|
229
|
+
return item
|
230
|
+
|
158
231
|
def _iterate_elements( # noqa: C901
|
159
232
|
self,
|
160
233
|
*,
|
@@ -162,6 +235,9 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
162
235
|
depth: int,
|
163
236
|
doc: DoclingDocument,
|
164
237
|
visited: Set[marko.element.Element],
|
238
|
+
creation_stack: list[
|
239
|
+
_CreationPayload
|
240
|
+
], # stack for lazy item creation triggered deep in marko's AST (on RawText)
|
165
241
|
parent_item: Optional[NodeItem] = None,
|
166
242
|
formatting: Optional[Formatting] = None,
|
167
243
|
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
@@ -177,28 +253,17 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
177
253
|
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
|
178
254
|
)
|
179
255
|
|
180
|
-
if len(element.children)
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
if element.level == 1:
|
188
|
-
parent_item = doc.add_title(
|
189
|
-
text=snippet_text,
|
190
|
-
parent=parent_item,
|
256
|
+
if len(element.children) > 1: # inline group will be created further down
|
257
|
+
parent_item = self._create_heading_item(
|
258
|
+
doc=doc,
|
259
|
+
parent_item=parent_item,
|
260
|
+
text="",
|
261
|
+
level=element.level,
|
191
262
|
formatting=formatting,
|
192
263
|
hyperlink=hyperlink,
|
193
264
|
)
|
194
265
|
else:
|
195
|
-
|
196
|
-
text=snippet_text,
|
197
|
-
level=element.level - 1,
|
198
|
-
parent=parent_item,
|
199
|
-
formatting=formatting,
|
200
|
-
hyperlink=hyperlink,
|
201
|
-
)
|
266
|
+
creation_stack.append(_HeadingCreationPayload(level=element.level))
|
202
267
|
|
203
268
|
elif isinstance(element, marko.block.List):
|
204
269
|
has_non_empty_list_items = False
|
@@ -224,22 +289,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
224
289
|
self._close_table(doc)
|
225
290
|
_log.debug(" - List item")
|
226
291
|
|
227
|
-
if len(child.children)
|
228
|
-
|
229
|
-
|
292
|
+
if len(child.children) > 1: # inline group will be created further down
|
293
|
+
parent_item = self._create_list_item(
|
294
|
+
doc=doc,
|
295
|
+
parent_item=parent_item,
|
296
|
+
text="",
|
297
|
+
formatting=formatting,
|
298
|
+
hyperlink=hyperlink,
|
299
|
+
)
|
230
300
|
else:
|
231
|
-
|
232
|
-
is_numbered = isinstance(parent_item, OrderedList)
|
233
|
-
if not isinstance(parent_item, (OrderedList, UnorderedList)):
|
234
|
-
_log.warning("ListItem would have not had a list parent, adding one.")
|
235
|
-
parent_item = doc.add_unordered_list(parent=parent_item)
|
236
|
-
parent_item = doc.add_list_item(
|
237
|
-
enumerated=is_numbered,
|
238
|
-
parent=parent_item,
|
239
|
-
text=snippet_text,
|
240
|
-
formatting=formatting,
|
241
|
-
hyperlink=hyperlink,
|
242
|
-
)
|
301
|
+
creation_stack.append(_ListItemCreationPayload())
|
243
302
|
|
244
303
|
elif isinstance(element, marko.inline.Image):
|
245
304
|
self._close_table(doc)
|
@@ -285,13 +344,38 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
285
344
|
self.md_table_buffer.append(snippet_text)
|
286
345
|
elif snippet_text:
|
287
346
|
self._close_table(doc)
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
347
|
+
|
348
|
+
if creation_stack:
|
349
|
+
while len(creation_stack) > 0:
|
350
|
+
to_create = creation_stack.pop()
|
351
|
+
if isinstance(to_create, _ListItemCreationPayload):
|
352
|
+
parent_item = self._create_list_item(
|
353
|
+
doc=doc,
|
354
|
+
parent_item=parent_item,
|
355
|
+
text=snippet_text,
|
356
|
+
formatting=formatting,
|
357
|
+
hyperlink=hyperlink,
|
358
|
+
)
|
359
|
+
elif isinstance(to_create, _HeadingCreationPayload):
|
360
|
+
# not keeping as parent_item as logic for correctly tracking
|
361
|
+
# that not implemented yet (section components not captured
|
362
|
+
# as heading children in marko)
|
363
|
+
self._create_heading_item(
|
364
|
+
doc=doc,
|
365
|
+
parent_item=parent_item,
|
366
|
+
text=snippet_text,
|
367
|
+
level=to_create.level,
|
368
|
+
formatting=formatting,
|
369
|
+
hyperlink=hyperlink,
|
370
|
+
)
|
371
|
+
else:
|
372
|
+
doc.add_text(
|
373
|
+
label=DocItemLabel.TEXT,
|
374
|
+
parent=parent_item,
|
375
|
+
text=snippet_text,
|
376
|
+
formatting=formatting,
|
377
|
+
hyperlink=hyperlink,
|
378
|
+
)
|
295
379
|
|
296
380
|
elif isinstance(element, marko.inline.CodeSpan):
|
297
381
|
self._close_table(doc)
|
@@ -353,7 +437,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
353
437
|
parent_item = doc.add_inline_group(parent=parent_item)
|
354
438
|
|
355
439
|
processed_block_types = (
|
356
|
-
# marko.block.Heading,
|
357
440
|
marko.block.CodeBlock,
|
358
441
|
marko.block.FencedCode,
|
359
442
|
marko.inline.RawText,
|
@@ -369,6 +452,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
369
452
|
depth=depth + 1,
|
370
453
|
doc=doc,
|
371
454
|
visited=visited,
|
455
|
+
creation_stack=creation_stack,
|
372
456
|
parent_item=parent_item,
|
373
457
|
formatting=formatting,
|
374
458
|
hyperlink=hyperlink,
|
@@ -412,6 +496,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
412
496
|
doc=doc,
|
413
497
|
parent_item=None,
|
414
498
|
visited=set(),
|
499
|
+
creation_stack=[],
|
415
500
|
)
|
416
501
|
self._close_table(doc=doc) # handle any last hanging table
|
417
502
|
|
@@ -397,7 +397,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
397
397
|
if isinstance(c, Hyperlink):
|
398
398
|
text = c.text
|
399
399
|
hyperlink = Path(c.address)
|
400
|
-
format =
|
400
|
+
format = (
|
401
|
+
self._get_format_from_run(c.runs[0])
|
402
|
+
if c.runs and len(c.runs) > 0
|
403
|
+
else None
|
404
|
+
)
|
401
405
|
elif isinstance(c, Run):
|
402
406
|
text = c.text
|
403
407
|
hyperlink = None
|
docling/datamodel/base_models.py
CHANGED
@@ -207,7 +207,7 @@ smolvlm_picture_description = PictureDescriptionVlmOptions(
|
|
207
207
|
|
208
208
|
# GraniteVision
|
209
209
|
granite_picture_description = PictureDescriptionVlmOptions(
|
210
|
-
repo_id="ibm-granite/granite-vision-3.
|
210
|
+
repo_id="ibm-granite/granite-vision-3.2-2b-preview",
|
211
211
|
prompt="What is shown in this image?",
|
212
212
|
)
|
213
213
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.38.
|
3
|
+
Version: 2.38.1
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -10,10 +10,10 @@ docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3U
|
|
10
10
|
docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
|
11
11
|
docling/backend/docling_parse_v4_backend.py,sha256=7tQvpCwpYoq98PNszDkrXaFhy5eWmQqMP4RjWWPLPgw,6197
|
12
12
|
docling/backend/html_backend.py,sha256=3K-l5SUAAyqISNEb7nPst_I51xzYOVOkgmwXh3lv9sw,21063
|
13
|
-
docling/backend/md_backend.py,sha256=
|
13
|
+
docling/backend/md_backend.py,sha256=kSQ7dn_IrAmt53kL_0Z5LnpE2fWif9RkBAGtqzgfQaM,20514
|
14
14
|
docling/backend/msexcel_backend.py,sha256=3j0WQfqDpgPXdPMCguefdv7arcNVDedPD6gl54cmLn8,18110
|
15
15
|
docling/backend/mspowerpoint_backend.py,sha256=0lsb8ZeQFxbDt7jZpSQyk5wYHYa3SP2T2y2dMI-o30o,15216
|
16
|
-
docling/backend/msword_backend.py,sha256=
|
16
|
+
docling/backend/msword_backend.py,sha256=xj009k1s7uzmNx3yGZZelsSgxa6ylaJ1yYHxYfHVLOo,44975
|
17
17
|
docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
|
18
18
|
docling/backend/pdf_backend.py,sha256=KE9TMuFO5WX-o5A_DAd4tEaLi4HMZ4XjKdpllItVkWM,2238
|
19
19
|
docling/backend/pypdfium2_backend.py,sha256=8dVniLHgiTdJuDbYr66kPp6Ccv5ZDlqDMEbA2xIfS7U,13370
|
@@ -34,9 +34,9 @@ docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
|
|
34
34
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
35
35
|
docling/datamodel/accelerator_options.py,sha256=wv6dOFTVAwr9onkE-0pfUqX_fDb6gX53iPPE6o8nKjI,2511
|
36
36
|
docling/datamodel/asr_model_specs.py,sha256=L7ETXsUKVbPsVcPLhEIMxQjd4UzMGZBVsy74CLsZBkU,2181
|
37
|
-
docling/datamodel/base_models.py,sha256=
|
37
|
+
docling/datamodel/base_models.py,sha256=67o1ptOTT8tW7i-g6gM2JKEX_1CDbmKEMQ_B9ZYM2z0,11156
|
38
38
|
docling/datamodel/document.py,sha256=CA_dgt4V_phze5HXpfgfKNBKd1cPC1o3WE_IENX63EM,16252
|
39
|
-
docling/datamodel/pipeline_options.py,sha256=
|
39
|
+
docling/datamodel/pipeline_options.py,sha256=7mKv1IThXYpu3osggp_Y2h7E5C8nbxJLQXS7JJPMvYQ,9479
|
40
40
|
docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
|
41
41
|
docling/datamodel/pipeline_options_vlm_model.py,sha256=rtDMVtKFZbgQD269w8FvHMXEhdRBrsA4rVYk6A-M-b4,2063
|
42
42
|
docling/datamodel/settings.py,sha256=ajMz7Ao2m0ZGYkfArqTDDbiF89O408mtgeh06PUi0MA,1900
|
@@ -90,9 +90,9 @@ docling/utils/orientation.py,sha256=xXlOfowL54FKwjsTFrM7y3ogk1wChLNn_-u74tYIf1s,
|
|
90
90
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
91
91
|
docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
|
92
92
|
docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
|
93
|
-
docling-2.38.
|
94
|
-
docling-2.38.
|
95
|
-
docling-2.38.
|
96
|
-
docling-2.38.
|
97
|
-
docling-2.38.
|
98
|
-
docling-2.38.
|
93
|
+
docling-2.38.1.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
94
|
+
docling-2.38.1.dist-info/METADATA,sha256=14E9MwQXlyuB4nWa31ZTjW6vvv5p2eCs2xxVTE4-qT4,10273
|
95
|
+
docling-2.38.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
96
|
+
docling-2.38.1.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
|
97
|
+
docling-2.38.1.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
|
98
|
+
docling-2.38.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|