docling-core 2.21.2__py3-none-any.whl → 2.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -0,0 +1,461 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2025
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define classes for Markdown serialization."""
7
+ import html
8
+ import re
9
+ import textwrap
10
+ from pathlib import Path
11
+ from typing import Optional, Union
12
+
13
+ from pydantic import AnyUrl, BaseModel, PositiveInt
14
+ from tabulate import tabulate
15
+ from typing_extensions import override
16
+
17
+ from docling_core.experimental.serializer.base import (
18
+ BaseDocSerializer,
19
+ BaseFallbackSerializer,
20
+ BaseFormSerializer,
21
+ BaseInlineSerializer,
22
+ BaseKeyValueSerializer,
23
+ BaseListSerializer,
24
+ BasePictureSerializer,
25
+ BaseTableSerializer,
26
+ BaseTextSerializer,
27
+ SerializationResult,
28
+ )
29
+ from docling_core.experimental.serializer.common import DocSerializer
30
+ from docling_core.types.doc.base import ImageRefMode
31
+ from docling_core.types.doc.document import (
32
+ CodeItem,
33
+ DocItem,
34
+ DoclingDocument,
35
+ Formatting,
36
+ FormItem,
37
+ FormulaItem,
38
+ ImageRef,
39
+ InlineGroup,
40
+ KeyValueItem,
41
+ NodeItem,
42
+ OrderedList,
43
+ PictureItem,
44
+ SectionHeaderItem,
45
+ TableItem,
46
+ TextItem,
47
+ TitleItem,
48
+ UnorderedList,
49
+ )
50
+
51
+
52
class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
    """Markdown-specific text item serializer."""

    # Optional column width used to wrap plain text items; None disables wrapping.
    wrap_width: Optional[PositiveInt] = None

    @override
    def serialize(
        self,
        *,
        item: TextItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        is_inline_scope: bool = False,
        **kwargs,
    ) -> SerializationResult:
        """Render a text item as Markdown.

        Titles and section headers become ``#`` headings, code items become
        inline or fenced code, formulas become ``$``/``$$`` math, and any
        other text item is emitted as-is (wrapped when ``wrap_width`` is set).
        The rendered text is then handed to ``doc_serializer.post_process``
        together with the item's formatting and hyperlink.

        Args:
            item: The text item to render.
            doc_serializer: Serializer whose ``post_process`` is applied to
                the rendered text.
            doc: The owning document (not used by this implementation).
            is_inline_scope: Selects the inline form for code and formulas.
            **kwargs: Accepted and ignored.

        Returns:
            The serialization result holding the post-processed text.
        """
        # Code and formulas are passed through without HTML/underscore escaping.
        verbatim = False

        if isinstance(item, TitleItem):
            rendered = f"# {item.text}"
        elif isinstance(item, SectionHeaderItem):
            hashes = (item.level + 1) * "#"
            rendered = f"{hashes} {item.text}"
        elif isinstance(item, CodeItem):
            verbatim = True
            if is_inline_scope:
                rendered = f"`{item.text}`"
            else:
                rendered = f"```\n{item.text}\n```"
        elif isinstance(item, FormulaItem):
            verbatim = True
            if item.text:
                fence = "$" if is_inline_scope else "$$"
                rendered = f"{fence}{item.text}{fence}"
            elif item.orig:
                # There is original content but no decoded formula text.
                rendered = "<!-- formula-not-decoded -->"
            else:
                rendered = ""
        elif self.wrap_width:
            rendered = textwrap.fill(item.text, width=self.wrap_width)
        else:
            rendered = item.text

        processed = doc_serializer.post_process(
            text=rendered,
            escape_html=not verbatim,
            escape_underscores=not verbatim,
            formatting=item.formatting,
            hyperlink=item.hyperlink,
        )
        return SerializationResult(text=processed)
99
+
100
+
101
class MarkdownTableSerializer(BaseTableSerializer):
    """Markdown-specific table item serializer."""

    @override
    def serialize(
        self,
        *,
        item: TableItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Render a table item as a GitHub-flavoured Markdown table.

        The caption (if any) comes first, followed by the table itself unless
        the item's ref is excluded by the document serializer. The first grid
        row is used as the header; grids with fewer than two rows, or with an
        empty first row, produce no table text.

        Args:
            item: The table item to render.
            doc_serializer: Provides caption serialization and excluded refs.
            doc: The owning document (not used by this implementation).
            **kwargs: Accepted and ignored.

        Returns:
            The serialization result; caption and table are separated by a
            blank line.
        """
        caption = doc_serializer.serialize_captions(item=item).text
        chunks: list[str] = [caption] if caption else []

        if item.self_ref not in doc_serializer.get_excluded_refs():
            grid = []
            for row in item.data.grid:
                # Newlines inside a cell would break the Markdown table layout.
                grid.append([cell.text.replace("\n", " ") for cell in row])

            rendered = ""
            if len(grid) > 1 and len(grid[0]) > 0:
                header, *body = grid
                try:
                    rendered = tabulate(body, headers=header, tablefmt="github")
                except ValueError:
                    # Retry with number parsing switched off.
                    rendered = tabulate(
                        body,
                        headers=header,
                        tablefmt="github",
                        disable_numparse=True,
                    )
            if rendered:
                chunks.append(rendered)

        return SerializationResult(text="\n\n".join(chunks))
149
+
150
+
151
class MarkdownPictureSerializer(BasePictureSerializer):
    """Markdown-specific picture item serializer."""

    @override
    def serialize(
        self,
        *,
        item: PictureItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        image_mode: Optional[ImageRefMode] = None,
        image_placeholder: Optional[str] = None,
        **kwargs,
    ) -> SerializationResult:
        """Render a picture item as Markdown.

        The caption (if any) comes first, followed by the image part unless
        the item's ref is excluded by the document serializer.

        Args:
            item: The picture item to render.
            doc_serializer: Provides caption serialization and excluded refs.
            doc: The owning document, forwarded to the image rendering step.
            image_mode: How the image is emitted; defaults to
                ``ImageRefMode.PLACEHOLDER`` when None.
            image_placeholder: Placeholder text; defaults to
                ``"<!-- image -->"`` when None.
            **kwargs: Accepted and ignored.

        Returns:
            The serialization result; caption and image are separated by a
            blank line.
        """
        mode = ImageRefMode.PLACEHOLDER if image_mode is None else image_mode
        placeholder = (
            "<!-- image -->" if image_placeholder is None else image_placeholder
        )

        pieces: list[str] = []

        caption = doc_serializer.serialize_captions(item=item, separator="\n")
        if caption.text:
            pieces.append(caption.text)

        if item.self_ref not in doc_serializer.get_excluded_refs():
            image_part = self._serialize_image_part(
                item=item,
                doc=doc,
                image_mode=mode,
                image_placeholder=placeholder,
            )
            if image_part.text:
                pieces.append(image_part.text)

        return SerializationResult(text="\n\n".join(pieces))

    def _serialize_image_part(
        self,
        item: PictureItem,
        doc: DoclingDocument,
        image_mode: ImageRefMode,
        image_placeholder: str,
        **kwargs,
    ) -> SerializationResult:
        """Render only the image portion of a picture item.

        EMBEDDED yields a Markdown image with a ``data:`` URI (reusing the
        item's own data URI when it has one, otherwise encoding the image
        returned by ``item.get_image``), or an explanatory HTML comment when
        no image is available. REFERENCED yields a Markdown image pointing at
        the item's URI unless there is no image reference or the URI is a
        ``data:`` URI. Every other case yields the placeholder.
        """
        if image_mode == ImageRefMode.EMBEDDED:
            image_ref = item.image
            # Short-cut: the image is already available as a data URI.
            if (
                isinstance(image_ref, ImageRef)
                and isinstance(image_ref.uri, AnyUrl)
                and image_ref.uri.scheme == "data"
            ):
                return SerializationResult(text=f"![Image]({image_ref.uri})")

            # Otherwise ask the item for its image.
            picture = item.get_image(doc=doc)
            if picture is None:
                unavailable = (
                    "<!-- 🖼️❌ Image not available. "
                    "Please use `PdfPipelineOptions(generate_picture_images=True)`"
                    " -->"
                )
                return SerializationResult(text=unavailable)

            encoded = item._image_to_base64(picture)
            return SerializationResult(
                text=f"![Image](data:image/png;base64,{encoded})"
            )

        if image_mode == ImageRefMode.REFERENCED:
            image_ref = item.image
            if isinstance(image_ref, ImageRef) and not (
                isinstance(image_ref.uri, AnyUrl) and image_ref.uri.scheme == "data"
            ):
                return SerializationResult(text=f"![Image]({str(image_ref.uri)})")
            return SerializationResult(text=image_placeholder)

        # PLACEHOLDER and any other mode.
        return SerializationResult(text=image_placeholder)
242
+
243
+
244
class MarkdownKeyValueSerializer(BaseKeyValueSerializer):
    """Markdown-specific key-value item serializer."""

    @override
    def serialize(
        self,
        *,
        item: KeyValueItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Emit a placeholder comment for a key-value item.

        Returns an empty result when the item's ref is excluded by the
        document serializer; otherwise returns a fixed HTML comment.
        """
        # TODO add actual implementation
        if item.self_ref in doc_serializer.get_excluded_refs():
            return SerializationResult(text="")
        return SerializationResult(text="<!-- missing-key-value-item -->")
264
+
265
+
266
class MarkdownFormSerializer(BaseFormSerializer):
    """Markdown-specific form item serializer."""

    @override
    def serialize(
        self,
        *,
        item: FormItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Emit a placeholder comment for a form item.

        Returns an empty result when the item's ref is excluded by the
        document serializer; otherwise returns a fixed HTML comment.
        """
        # TODO add actual implementation
        if item.self_ref in doc_serializer.get_excluded_refs():
            return SerializationResult(text="")
        return SerializationResult(text="<!-- missing-form-item -->")
286
+
287
+
288
class MarkdownListSerializer(BaseModel, BaseListSerializer):
    """Markdown-specific list serializer."""

    # Number of spaces of indentation applied per list nesting level.
    indent: int = 4

    @override
    def serialize(
        self,
        *,
        item: Union[UnorderedList, OrderedList],
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        list_level: int = 0,
        is_inline_scope: bool = False,
        visited: Optional[set[str]] = None,  # refs of visited items
        **kwargs,
    ) -> SerializationResult:
        """Render an ordered or unordered list as Markdown.

        The list's parts are obtained from ``doc_serializer.get_parts`` at
        ``list_level + 1``. Each part is prefixed with the indentation for
        ``list_level`` and a marker (``N.`` for ordered lists, ``-``
        otherwise). A part whose text already starts with a space is treated
        as an already rendered sublist and emitted unchanged.

        Args:
            item: The list group to render.
            doc_serializer: Provides the serialized parts of the list.
            doc: The owning document (not used by this implementation).
            list_level: Nesting depth of this list; 0 for a top-level list.
            is_inline_scope: Forwarded to ``get_parts``.
            visited: Set of refs of visited items, forwarded to
                ``get_parts``; a new set is created when None.
            **kwargs: Accepted and ignored.

        Returns:
            The serialization result with one line per part.
        """
        # Compare against None explicitly: with `visited or set()` an empty
        # set supplied by the caller is falsy and would be swapped for a new
        # one, so the caller's set would not be the one passed to get_parts().
        my_visited: set[str] = visited if visited is not None else set()
        parts = doc_serializer.get_parts(
            node=item,
            list_level=list_level + 1,
            is_inline_scope=is_inline_scope,
            visited=my_visited,
        )
        indent_str = list_level * self.indent * " "
        is_ol = isinstance(item, OrderedList)

        lines: list[str] = []
        # NOTE: the ordinal is the part's position among *all* parts, so a
        # sublist part also consumes a number.
        for i, c in enumerate(parts):
            if c.text and c.text[0] == " ":
                # avoid additional marker on already evaled sublists
                lines.append(c.text)
            else:
                marker = f"{i + 1}." if is_ol else "-"
                lines.append(f"{indent_str}{marker} {c.text}")

        return SerializationResult(text="\n".join(lines))
327
+
328
+
329
class MarkdownInlineSerializer(BaseInlineSerializer):
    """Markdown-specific inline group serializer."""

    @override
    def serialize(
        self,
        *,
        item: InlineGroup,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        list_level: int = 0,
        visited: Optional[set[str]] = None,  # refs of visited items
        **kwargs,
    ) -> SerializationResult:
        """Render an inline group as a single space-separated line.

        The group's parts are obtained from ``doc_serializer.get_parts`` with
        ``is_inline_scope=True``; parts with empty text are dropped.

        Args:
            item: The inline group to render.
            doc_serializer: Provides the serialized parts of the group.
            doc: The owning document (not used by this implementation).
            list_level: Forwarded unchanged to ``get_parts``.
            visited: Set of refs of visited items, forwarded to
                ``get_parts``; a new set is created when None.
            **kwargs: Accepted and ignored.

        Returns:
            The serialization result with the non-empty parts joined by a
            single space.
        """
        # Compare against None explicitly: with `visited or set()` an empty
        # set supplied by the caller is falsy and would be swapped for a new
        # one, so the caller's set would not be the one passed to get_parts().
        my_visited: set[str] = visited if visited is not None else set()
        parts = doc_serializer.get_parts(
            node=item,
            list_level=list_level,
            is_inline_scope=True,
            visited=my_visited,
        )
        text_res = " ".join(p.text for p in parts if p.text)
        return SerializationResult(text=text_res)
353
+
354
+
355
class MarkdownFallbackSerializer(BaseFallbackSerializer):
    """Markdown-specific fallback serializer."""

    @override
    def serialize(
        self,
        *,
        item: NodeItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Emit a placeholder for an item this serializer falls back on.

        ``DocItem`` instances yield a fixed HTML comment; any other node
        yields empty text.
        """
        # TODO go with explicit None return type?
        placeholder = "<!-- missing-text -->" if isinstance(item, DocItem) else ""
        return SerializationResult(text=placeholder)
373
+
374
+
375
class MarkdownDocSerializer(DocSerializer):
    """Markdown-specific document serializer."""

    # Per-item-type serializers, defaulting to the Markdown implementations.
    text_serializer: BaseTextSerializer = MarkdownTextSerializer()
    table_serializer: BaseTableSerializer = MarkdownTableSerializer()
    picture_serializer: BasePictureSerializer = MarkdownPictureSerializer()
    key_value_serializer: BaseKeyValueSerializer = MarkdownKeyValueSerializer()
    form_serializer: BaseFormSerializer = MarkdownFormSerializer()
    fallback_serializer: BaseFallbackSerializer = MarkdownFallbackSerializer()

    # Serializers for grouping nodes.
    list_serializer: BaseListSerializer = MarkdownListSerializer()
    inline_serializer: BaseInlineSerializer = MarkdownInlineSerializer()

    @override
    def serialize_bold(self, text: str, **kwargs):
        """Wrap the text in Markdown bold markers (``**``)."""
        return f"**{text}**"

    @override
    def serialize_italic(self, text: str, **kwargs):
        """Wrap the text in Markdown italic markers (``*``)."""
        return f"*{text}*"

    @override
    def serialize_strikethrough(self, text: str, **kwargs):
        """Wrap the text in Markdown strikethrough markers (``~~``)."""
        return f"~~{text}~~"

    @override
    def serialize_hyperlink(self, text: str, hyperlink: Union[AnyUrl, Path], **kwargs):
        """Render the text as a Markdown link pointing at ``hyperlink``."""
        return f"[{text}]({str(hyperlink)})"

    @classmethod
    def _escape_underscores(cls, text: str):
        """Backslash-escape underscores outside Markdown image references.

        Spans matching the image syntax ``![...](...)`` are copied through
        untouched; everywhere else each underscore that is not already
        preceded by a backslash is escaped.
        """

        def _escape(segment: str) -> str:
            # Skip underscores that already carry a backslash.
            return re.sub(r"(?<!\\)_", r"\_", segment)

        pieces = []
        cursor = 0
        for found in re.finditer(r"!\[.*?\]\((.*?)\)", text):
            start, end = found.span()
            # Text in front of the image reference is escaped ...
            pieces.append(_escape(text[cursor:start]))
            # ... while the image reference itself is kept verbatim.
            pieces.append(found.group(0))
            cursor = end

        # Remainder after the last image reference (empty when nothing is left).
        pieces.append(_escape(text[cursor:]))
        return "".join(pieces)

    def post_process(
        self,
        text: str,
        *,
        escape_html: bool = True,
        escape_underscores: bool = True,
        formatting: Optional[Formatting] = None,
        hyperlink: Optional[Union[AnyUrl, Path]] = None,
        **kwargs,
    ) -> str:
        """Escape the text, then delegate to the parent's post-processing.

        Underscores are escaped only when both the ``escape_underscores``
        argument and ``self.escape_underscores`` are truthy. HTML escaping
        (without quote escaping) follows when ``escape_html`` is set. The
        result is passed to the parent's ``post_process`` together with
        ``formatting`` and ``hyperlink``.
        """
        processed = text
        if escape_underscores and self.escape_underscores:
            processed = self._escape_underscores(processed)
        if escape_html:
            processed = html.escape(processed, quote=False)
        return super().post_process(
            text=processed,
            formatting=formatting,
            hyperlink=hyperlink,
        )

    @override
    def serialize(self, **kwargs) -> SerializationResult:
        """Serialize the document, joining non-empty parts with blank lines."""
        non_empty = [part.text for part in self.get_parts() if part.text]
        return SerializationResult(text="\n\n".join(non_empty))