docling-core 2.21.2__py3-none-any.whl → 2.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/experimental/__init__.py +6 -0
- docling_core/experimental/serializer/__init__.py +6 -0
- docling_core/experimental/serializer/base.py +227 -0
- docling_core/experimental/serializer/common.py +353 -0
- docling_core/experimental/serializer/markdown.py +461 -0
- docling_core/types/doc/document.py +779 -330
- docling_core/types/doc/page.py +1238 -0
- docling_core/types/doc/tokens.py +1 -0
- {docling_core-2.21.2.dist-info → docling_core-2.23.0.dist-info}/METADATA +1 -1
- {docling_core-2.21.2.dist-info → docling_core-2.23.0.dist-info}/RECORD +13 -7
- {docling_core-2.21.2.dist-info → docling_core-2.23.0.dist-info}/LICENSE +0 -0
- {docling_core-2.21.2.dist-info → docling_core-2.23.0.dist-info}/WHEEL +0 -0
- {docling_core-2.21.2.dist-info → docling_core-2.23.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,461 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2025
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Define classes for Markdown serialization."""
|
|
7
|
+
import html
|
|
8
|
+
import re
|
|
9
|
+
import textwrap
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Optional, Union
|
|
12
|
+
|
|
13
|
+
from pydantic import AnyUrl, BaseModel, PositiveInt
|
|
14
|
+
from tabulate import tabulate
|
|
15
|
+
from typing_extensions import override
|
|
16
|
+
|
|
17
|
+
from docling_core.experimental.serializer.base import (
|
|
18
|
+
BaseDocSerializer,
|
|
19
|
+
BaseFallbackSerializer,
|
|
20
|
+
BaseFormSerializer,
|
|
21
|
+
BaseInlineSerializer,
|
|
22
|
+
BaseKeyValueSerializer,
|
|
23
|
+
BaseListSerializer,
|
|
24
|
+
BasePictureSerializer,
|
|
25
|
+
BaseTableSerializer,
|
|
26
|
+
BaseTextSerializer,
|
|
27
|
+
SerializationResult,
|
|
28
|
+
)
|
|
29
|
+
from docling_core.experimental.serializer.common import DocSerializer
|
|
30
|
+
from docling_core.types.doc.base import ImageRefMode
|
|
31
|
+
from docling_core.types.doc.document import (
|
|
32
|
+
CodeItem,
|
|
33
|
+
DocItem,
|
|
34
|
+
DoclingDocument,
|
|
35
|
+
Formatting,
|
|
36
|
+
FormItem,
|
|
37
|
+
FormulaItem,
|
|
38
|
+
ImageRef,
|
|
39
|
+
InlineGroup,
|
|
40
|
+
KeyValueItem,
|
|
41
|
+
NodeItem,
|
|
42
|
+
OrderedList,
|
|
43
|
+
PictureItem,
|
|
44
|
+
SectionHeaderItem,
|
|
45
|
+
TableItem,
|
|
46
|
+
TextItem,
|
|
47
|
+
TitleItem,
|
|
48
|
+
UnorderedList,
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
    """Render text-like items (titles, headings, code, formulas, plain text) as Markdown."""

    # Column at which plain text is hard-wrapped via ``textwrap.fill``;
    # ``None`` leaves the text unwrapped.
    wrap_width: Optional[PositiveInt] = None

    @override
    def serialize(
        self,
        *,
        item: TextItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        is_inline_scope: bool = False,
        **kwargs,
    ) -> SerializationResult:
        """Convert a single text item into its Markdown representation.

        Args:
            item: The text item to render.
            doc_serializer: Document serializer whose ``post_process`` is
                applied to the rendered text.
            doc: The document the item belongs to (not read here).
            is_inline_scope: When true, code and formulas use their inline
                delimiters instead of the block ones.
            **kwargs: Accepted for interface compatibility; ignored.

        Returns:
            The serialization result wrapping the post-processed Markdown.
        """
        # Code and formulas are passed through verbatim: neither HTML nor
        # underscore escaping is requested for them.
        verbatim = False

        if isinstance(item, TitleItem):
            rendered = f"# {item.text}"
        elif isinstance(item, SectionHeaderItem):
            # One more '#' than the section level, since '#' is used for titles.
            hashes = "#" * (item.level + 1)
            rendered = f"{hashes} {item.text}"
        elif isinstance(item, CodeItem):
            verbatim = True
            if is_inline_scope:
                rendered = f"`{item.text}`"
            else:
                rendered = f"```\n{item.text}\n```"
        elif isinstance(item, FormulaItem):
            verbatim = True
            if item.text:
                if is_inline_scope:
                    rendered = f"${item.text}$"
                else:
                    rendered = f"$${item.text}$$"
            elif item.orig:
                # There was a formula in the source, but no decoded text for it.
                rendered = "<!-- formula-not-decoded -->"
            else:
                rendered = ""
        elif self.wrap_width:
            rendered = textwrap.fill(item.text, width=self.wrap_width)
        else:
            rendered = item.text

        processed = doc_serializer.post_process(
            text=rendered,
            escape_html=not verbatim,
            escape_underscores=not verbatim,
            formatting=item.formatting,
            hyperlink=item.hyperlink,
        )
        return SerializationResult(text=processed)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class MarkdownTableSerializer(BaseTableSerializer):
    """Render table items as Markdown tables (``tabulate`` "github" format)."""

    @override
    def serialize(
        self,
        *,
        item: TableItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Convert a table item into captions plus a Markdown table.

        Args:
            item: The table item to render.
            doc_serializer: Document serializer providing the caption text and
                the set of excluded item references.
            doc: The document the item belongs to (not read here).
            **kwargs: Accepted for interface compatibility; ignored.

        Returns:
            The serialization result; captions and table are separated by a
            blank line, and either may be absent.
        """
        chunks: list[str] = []

        captions = doc_serializer.serialize_captions(item=item).text
        if captions:
            chunks.append(captions)

        if item.self_ref not in doc_serializer.get_excluded_refs():
            # A newline inside a cell would break the Markdown table, so each
            # one is flattened to a space.
            grid = [
                [cell.text.replace("\n", " ") for cell in grid_row]
                for grid_row in item.data.grid
            ]

            markdown_table = ""
            # The first row is the header, so at least one more row is needed.
            if len(grid) > 1 and len(grid[0]) > 0:
                header, body = grid[0], grid[1:]
                try:
                    markdown_table = tabulate(body, headers=header, tablefmt="github")
                except ValueError:
                    # Retry with tabulate's number parsing switched off.
                    markdown_table = tabulate(
                        body,
                        headers=header,
                        tablefmt="github",
                        disable_numparse=True,
                    )

            if markdown_table:
                chunks.append(markdown_table)

        return SerializationResult(text="\n\n".join(chunks))
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
class MarkdownPictureSerializer(BasePictureSerializer):
    """Markdown-specific picture item serializer."""

    @override
    def serialize(
        self,
        *,
        item: PictureItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        image_mode: Optional[ImageRefMode] = None,
        image_placeholder: Optional[str] = None,
        **kwargs,
    ) -> SerializationResult:
        """Serialize a picture item as its captions followed by its image part.

        Args:
            item: The picture item to serialize.
            doc_serializer: Document serializer providing the caption text and
                the set of excluded item references.
            doc: The document the item belongs to; used to obtain the image
                when it has to be embedded.
            image_mode: How the image is represented; defaults to
                ``ImageRefMode.PLACEHOLDER`` when ``None``.
            image_placeholder: Text emitted in place of the image; defaults to
                ``"<!-- image -->"`` when ``None``.
            **kwargs: Accepted for interface compatibility; ignored.

        Returns:
            The serialization result; captions and image part are separated by
            a blank line, and either may be absent.
        """
        my_image_mode = (
            image_mode if image_mode is not None else ImageRefMode.PLACEHOLDER
        )
        my_image_placeholder = (
            image_placeholder if image_placeholder is not None else "<!-- image -->"
        )

        texts: list[str] = []

        cap_res = doc_serializer.serialize_captions(
            item=item,
            separator="\n",
        )
        if cap_res.text:
            texts.append(cap_res.text)

        # Captions are always emitted; the image itself only when the item
        # is not among the excluded references.
        if item.self_ref not in doc_serializer.get_excluded_refs():
            img_res = self._serialize_image_part(
                item=item,
                doc=doc,
                image_mode=my_image_mode,
                image_placeholder=my_image_placeholder,
            )
            if img_res.text:
                texts.append(img_res.text)

        text_res = "\n\n".join(texts)

        return SerializationResult(text=text_res)

    def _serialize_image_part(
        self,
        item: PictureItem,
        doc: DoclingDocument,
        image_mode: ImageRefMode,
        image_placeholder: str,
        **kwargs,
    ) -> SerializationResult:
        """Render only the image of a picture item according to ``image_mode``.

        Args:
            item: The picture item whose image is rendered.
            doc: The document the item belongs to; passed to
                ``item.get_image`` in embedded mode.
            image_mode: Placeholder, embedded (data URI) or referenced (URI).
            image_placeholder: Text used for placeholder mode and as the
                fallback for referenced mode and unknown modes.
            **kwargs: Accepted for interface compatibility; ignored.

        Returns:
            The serialization result holding the placeholder, a Markdown image
            reference, or an HTML comment explaining the image is unavailable.
        """
        error_response = (
            "<!-- 🖼️❌ Image not available. "
            "Please use `PdfPipelineOptions(generate_picture_images=True)`"
            " -->"
        )
        if image_mode == ImageRefMode.PLACEHOLDER:
            text_res = image_placeholder
        elif image_mode == ImageRefMode.EMBEDDED:
            # short-cut: we already have the image in base64
            if (
                isinstance(item.image, ImageRef)
                and isinstance(item.image.uri, AnyUrl)
                and item.image.uri.scheme == "data"
            ):
                text_res = f"![Image]({item.image.uri})"
            else:
                # get the item.image._pil or crop it out of the page-image
                img = item.get_image(doc=doc)

                if img is not None:
                    imgb64 = item._image_to_base64(img)
                    # NOTE(review): assumes `_image_to_base64` yields
                    # PNG-encoded base64 text -- confirm against PictureItem.
                    text_res = f"![Image](data:image/png;base64,{imgb64})"
                else:
                    text_res = error_response
        elif image_mode == ImageRefMode.REFERENCED:
            # A reference needs a real location: with no image at all, or with
            # an inline data URI, fall back to the placeholder.
            if not isinstance(item.image, ImageRef) or (
                isinstance(item.image.uri, AnyUrl) and item.image.uri.scheme == "data"
            ):
                text_res = image_placeholder
            else:
                text_res = f"![Image]({str(item.image.uri)})"
        else:
            text_res = image_placeholder

        return SerializationResult(text=text_res)
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
class MarkdownKeyValueSerializer(BaseKeyValueSerializer):
    """Placeholder Markdown serializer for key-value items."""

    @override
    def serialize(
        self,
        *,
        item: KeyValueItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Emit a placeholder comment for the item, or nothing if it is excluded.

        Args:
            item: The key-value item to serialize.
            doc_serializer: Document serializer providing the excluded refs.
            doc: The document the item belongs to (not read here).
            **kwargs: Accepted for interface compatibility; ignored.

        Returns:
            The serialization result holding the placeholder comment, or an
            empty string when the item's reference is excluded.
        """
        # TODO add actual implementation
        if item.self_ref in doc_serializer.get_excluded_refs():
            return SerializationResult(text="")
        return SerializationResult(text="<!-- missing-key-value-item -->")
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
class MarkdownFormSerializer(BaseFormSerializer):
    """Placeholder Markdown serializer for form items."""

    @override
    def serialize(
        self,
        *,
        item: FormItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Emit a placeholder comment for the item, or nothing if it is excluded.

        Args:
            item: The form item to serialize.
            doc_serializer: Document serializer providing the excluded refs.
            doc: The document the item belongs to (not read here).
            **kwargs: Accepted for interface compatibility; ignored.

        Returns:
            The serialization result holding the placeholder comment, or an
            empty string when the item's reference is excluded.
        """
        # TODO add actual implementation
        if item.self_ref in doc_serializer.get_excluded_refs():
            return SerializationResult(text="")
        return SerializationResult(text="<!-- missing-form-item -->")
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
class MarkdownListSerializer(BaseModel, BaseListSerializer):
    """Markdown-specific list serializer."""

    # Number of spaces of indentation added per list nesting level.
    indent: int = 4

    @override
    def serialize(
        self,
        *,
        item: Union[UnorderedList, OrderedList],
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        list_level: int = 0,
        is_inline_scope: bool = False,
        visited: Optional[set[str]] = None,  # refs of visited items
        **kwargs,
    ) -> SerializationResult:
        """Serialize an ordered or unordered list as Markdown list lines.

        Args:
            item: The list group to serialize.
            doc_serializer: Document serializer used to obtain the serialized
                parts of the list's children.
            doc: The document the item belongs to (not read here).
            list_level: Nesting depth of this list; children are requested at
                ``list_level + 1``.
            is_inline_scope: Forwarded unchanged to ``get_parts``.
            visited: Set of refs of already visited items, shared with
                ``get_parts``; a new set is created only when ``None``.
            **kwargs: Accepted for interface compatibility; ignored.

        Returns:
            The serialization result with one line per part, joined by
            newlines.
        """
        # Test against None explicitly: `visited or set()` would swap a
        # caller-supplied *empty* set for a fresh one, so anything recorded
        # in it downstream (presumably by `get_parts` -- confirm) would be
        # lost to the caller.
        my_visited = visited if visited is not None else set()
        parts = doc_serializer.get_parts(
            node=item,
            list_level=list_level + 1,
            is_inline_scope=is_inline_scope,
            visited=my_visited,
        )
        indent_str = list_level * self.indent * " "
        is_ol = isinstance(item, OrderedList)

        lines: list[str] = []
        for i, c in enumerate(parts):
            if c.text and c.text[0] == " ":
                # avoid additional marker on already evaled sublists
                lines.append(c.text)
            else:
                marker = f"{i + 1}." if is_ol else "-"
                lines.append(f"{indent_str}{marker} {c.text}")

        return SerializationResult(text="\n".join(lines))
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
class MarkdownInlineSerializer(BaseInlineSerializer):
    """Markdown-specific inline group serializer."""

    @override
    def serialize(
        self,
        *,
        item: InlineGroup,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        list_level: int = 0,
        visited: Optional[set[str]] = None,  # refs of visited items
        **kwargs,
    ) -> SerializationResult:
        """Serialize an inline group as its parts joined by single spaces.

        Args:
            item: The inline group to serialize.
            doc_serializer: Document serializer used to obtain the serialized
                parts of the group's children (requested in inline scope).
            doc: The document the item belongs to (not read here).
            list_level: Forwarded unchanged to ``get_parts``.
            visited: Set of refs of already visited items, shared with
                ``get_parts``; a new set is created only when ``None``.
            **kwargs: Accepted for interface compatibility; ignored.

        Returns:
            The serialization result holding the non-empty parts separated by
            a single space.
        """
        # Test against None explicitly: `visited or set()` would swap a
        # caller-supplied *empty* set for a fresh one, so anything recorded
        # in it downstream (presumably by `get_parts` -- confirm) would be
        # lost to the caller.
        my_visited = visited if visited is not None else set()
        parts = doc_serializer.get_parts(
            node=item,
            list_level=list_level,
            is_inline_scope=True,
            visited=my_visited,
        )
        text_res = " ".join([p.text for p in parts if p.text])
        return SerializationResult(text=text_res)
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
class MarkdownFallbackSerializer(BaseFallbackSerializer):
    """Markdown serializer for nodes with no dedicated serializer."""

    @override
    def serialize(
        self,
        *,
        item: NodeItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Emit a "missing text" comment for document items, else an empty result.

        Args:
            item: The node to serialize.
            doc_serializer: The calling document serializer (not read here).
            doc: The document the item belongs to (not read here).
            **kwargs: Accepted for interface compatibility; ignored.

        Returns:
            The serialization result holding the placeholder comment when
            ``item`` is a ``DocItem``, otherwise an empty string.
        """
        # TODO go with explicit None return type?
        placeholder = "<!-- missing-text -->" if isinstance(item, DocItem) else ""
        return SerializationResult(text=placeholder)
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
class MarkdownDocSerializer(DocSerializer):
    """Document serializer producing Markdown output."""

    # Per-item-type serializers, defaulting to the Markdown implementations.
    text_serializer: BaseTextSerializer = MarkdownTextSerializer()
    table_serializer: BaseTableSerializer = MarkdownTableSerializer()
    picture_serializer: BasePictureSerializer = MarkdownPictureSerializer()
    key_value_serializer: BaseKeyValueSerializer = MarkdownKeyValueSerializer()
    form_serializer: BaseFormSerializer = MarkdownFormSerializer()
    fallback_serializer: BaseFallbackSerializer = MarkdownFallbackSerializer()

    # Serializers for group nodes.
    list_serializer: BaseListSerializer = MarkdownListSerializer()
    inline_serializer: BaseInlineSerializer = MarkdownInlineSerializer()

    @override
    def serialize_bold(self, text: str, **kwargs):
        """Wrap the text in Markdown bold markers."""
        return f"**{text}**"

    @override
    def serialize_italic(self, text: str, **kwargs):
        """Wrap the text in Markdown italic markers."""
        return f"*{text}*"

    @override
    def serialize_strikethrough(self, text: str, **kwargs):
        """Wrap the text in Markdown strikethrough markers."""
        return f"~~{text}~~"

    @override
    def serialize_hyperlink(self, text: str, hyperlink: Union[AnyUrl, Path], **kwargs):
        """Render the text as a Markdown link pointing at ``hyperlink``."""
        return f"[{text}]({str(hyperlink)})"

    @classmethod
    def _escape_underscores(cls, text: str):
        """Backslash-escape underscores, leaving Markdown image references intact.

        Underscores already preceded by a backslash are not escaped again.
        """
        image_ref = r"!\[.*?\]\((.*?)\)"
        bare_underscore = r"(?<!\\)_"

        chunks = []
        cursor = 0
        for hit in re.finditer(image_ref, text):
            # Escape the text ahead of the image reference ...
            chunks.append(re.sub(bare_underscore, r"\_", text[cursor : hit.start()]))
            # ... and copy the reference itself through untouched.
            chunks.append(hit.group(0))
            cursor = hit.end()

        # Whatever follows the last image reference still needs escaping.
        tail = text[cursor:]
        if tail:
            chunks.append(re.sub(bare_underscore, r"\_", tail))

        return "".join(chunks)

    def post_process(
        self,
        text: str,
        *,
        escape_html: bool = True,
        escape_underscores: bool = True,
        formatting: Optional[Formatting] = None,
        hyperlink: Optional[Union[AnyUrl, Path]] = None,
        **kwargs,
    ) -> str:
        """Escape the text for Markdown, then defer to the parent post-processing.

        Args:
            text: The text to post-process.
            escape_html: Whether to HTML-escape the text (quotes are kept).
            escape_underscores: Whether to escape underscores; only honoured
                when the serializer's own ``escape_underscores`` setting is
                also truthy.
            formatting: Forwarded to the parent ``post_process``.
            hyperlink: Forwarded to the parent ``post_process``.
            **kwargs: Accepted for interface compatibility; ignored.

        Returns:
            The post-processed text.
        """
        processed = text
        if escape_underscores and self.escape_underscores:
            processed = self._escape_underscores(processed)
        if escape_html:
            processed = html.escape(processed, quote=False)
        return super().post_process(
            text=processed,
            formatting=formatting,
            hyperlink=hyperlink,
        )

    @override
    def serialize(self, **kwargs) -> SerializationResult:
        """Serialize the document: non-empty parts separated by blank lines."""
        rendered = [part.text for part in self.get_parts() if part.text]
        return SerializationResult(text="\n\n".join(rendered))
|