docling-core 2.23.2__py3-none-any.whl → 2.24.0__py3-none-any.whl
This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries and as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/experimental/serializer/base.py +2 -2
- docling_core/experimental/serializer/common.py +250 -196
- docling_core/experimental/serializer/doctags.py +492 -0
- docling_core/experimental/serializer/markdown.py +80 -43
- docling_core/types/doc/document.py +412 -418
- docling_core/types/doc/page.py +18 -6
- docling_core/types/doc/tokens.py +192 -26
- {docling_core-2.23.2.dist-info → docling_core-2.24.0.dist-info}/METADATA +1 -1
- {docling_core-2.23.2.dist-info → docling_core-2.24.0.dist-info}/RECORD +12 -11
- {docling_core-2.23.2.dist-info → docling_core-2.24.0.dist-info}/LICENSE +0 -0
- {docling_core-2.23.2.dist-info → docling_core-2.24.0.dist-info}/WHEEL +0 -0
- {docling_core-2.23.2.dist-info → docling_core-2.24.0.dist-info}/entry_points.txt +0 -0
|
@@ -26,12 +26,14 @@ from docling_core.experimental.serializer.base import (
|
|
|
26
26
|
BaseTextSerializer,
|
|
27
27
|
SerializationResult,
|
|
28
28
|
)
|
|
29
|
-
from docling_core.experimental.serializer.common import DocSerializer
|
|
29
|
+
from docling_core.experimental.serializer.common import CommonParams, DocSerializer
|
|
30
30
|
from docling_core.types.doc.base import ImageRefMode
|
|
31
31
|
from docling_core.types.doc.document import (
|
|
32
32
|
CodeItem,
|
|
33
|
+
ContentLayer,
|
|
33
34
|
DocItem,
|
|
34
35
|
DoclingDocument,
|
|
36
|
+
FloatingItem,
|
|
35
37
|
Formatting,
|
|
36
38
|
FormItem,
|
|
37
39
|
FormulaItem,
|
|
@@ -49,10 +51,20 @@ from docling_core.types.doc.document import (
|
|
|
49
51
|
)
|
|
50
52
|
|
|
51
53
|
|
|
52
|
-
class
|
|
53
|
-
"""Markdown-specific
|
|
54
|
+
class MarkdownParams(CommonParams):
|
|
55
|
+
"""Markdown-specific serialization parameters."""
|
|
54
56
|
|
|
57
|
+
layers: set[ContentLayer] = {ContentLayer.BODY}
|
|
58
|
+
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER
|
|
59
|
+
image_placeholder: str = "<!-- image -->"
|
|
60
|
+
indent: int = 4
|
|
55
61
|
wrap_width: Optional[PositiveInt] = None
|
|
62
|
+
page_break_placeholder: Optional[str] = None # e.g. "<!-- page break -->"
|
|
63
|
+
escape_underscores: bool = True
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
67
|
+
"""Markdown-specific text item serializer."""
|
|
56
68
|
|
|
57
69
|
@override
|
|
58
70
|
def serialize(
|
|
@@ -65,37 +77,47 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
65
77
|
**kwargs,
|
|
66
78
|
) -> SerializationResult:
|
|
67
79
|
"""Serializes the passed item."""
|
|
80
|
+
params = MarkdownParams(**kwargs)
|
|
81
|
+
parts: list[str] = []
|
|
68
82
|
escape_html = True
|
|
69
83
|
escape_underscores = True
|
|
70
84
|
if isinstance(item, TitleItem):
|
|
71
|
-
|
|
85
|
+
text = f"# {item.text}"
|
|
72
86
|
elif isinstance(item, SectionHeaderItem):
|
|
73
|
-
|
|
87
|
+
text = f"{(item.level + 1) * '#'} {item.text}"
|
|
74
88
|
elif isinstance(item, CodeItem):
|
|
75
|
-
|
|
89
|
+
text = f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
|
|
76
90
|
escape_html = False
|
|
77
91
|
escape_underscores = False
|
|
78
92
|
elif isinstance(item, FormulaItem):
|
|
79
93
|
if item.text:
|
|
80
|
-
|
|
94
|
+
text = f"${item.text}$" if is_inline_scope else f"$${item.text}$$"
|
|
81
95
|
elif item.orig:
|
|
82
|
-
|
|
96
|
+
text = "<!-- formula-not-decoded -->"
|
|
83
97
|
else:
|
|
84
|
-
|
|
98
|
+
text = ""
|
|
85
99
|
escape_html = False
|
|
86
100
|
escape_underscores = False
|
|
87
|
-
elif
|
|
88
|
-
|
|
101
|
+
elif params.wrap_width:
|
|
102
|
+
text = textwrap.fill(item.text, width=params.wrap_width)
|
|
89
103
|
else:
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
104
|
+
text = item.text
|
|
105
|
+
parts.append(text)
|
|
106
|
+
|
|
107
|
+
if isinstance(item, FloatingItem):
|
|
108
|
+
cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
|
|
109
|
+
if cap_text:
|
|
110
|
+
parts.append(cap_text)
|
|
111
|
+
|
|
112
|
+
text_res = (" " if is_inline_scope else "\n\n").join(parts)
|
|
113
|
+
text_res = doc_serializer.post_process(
|
|
114
|
+
text=text_res,
|
|
93
115
|
escape_html=escape_html,
|
|
94
116
|
escape_underscores=escape_underscores,
|
|
95
117
|
formatting=item.formatting,
|
|
96
118
|
hyperlink=item.hyperlink,
|
|
97
119
|
)
|
|
98
|
-
return SerializationResult(text=
|
|
120
|
+
return SerializationResult(text=text_res)
|
|
99
121
|
|
|
100
122
|
|
|
101
123
|
class MarkdownTableSerializer(BaseTableSerializer):
|
|
@@ -113,12 +135,14 @@ class MarkdownTableSerializer(BaseTableSerializer):
|
|
|
113
135
|
"""Serializes the passed item."""
|
|
114
136
|
text_parts: list[str] = []
|
|
115
137
|
|
|
116
|
-
|
|
138
|
+
cap_res = doc_serializer.serialize_captions(
|
|
117
139
|
item=item,
|
|
118
|
-
|
|
119
|
-
|
|
140
|
+
**kwargs,
|
|
141
|
+
)
|
|
142
|
+
if cap_res.text:
|
|
143
|
+
text_parts.append(cap_res.text)
|
|
120
144
|
|
|
121
|
-
if item.self_ref not in doc_serializer.get_excluded_refs():
|
|
145
|
+
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
122
146
|
rows = [
|
|
123
147
|
[
|
|
124
148
|
# make sure that md tables are not broken
|
|
@@ -158,33 +182,26 @@ class MarkdownPictureSerializer(BasePictureSerializer):
|
|
|
158
182
|
item: PictureItem,
|
|
159
183
|
doc_serializer: BaseDocSerializer,
|
|
160
184
|
doc: DoclingDocument,
|
|
161
|
-
image_mode: Optional[ImageRefMode] = None,
|
|
162
|
-
image_placeholder: Optional[str] = None,
|
|
163
185
|
**kwargs,
|
|
164
186
|
) -> SerializationResult:
|
|
165
187
|
"""Serializes the passed item."""
|
|
166
|
-
|
|
167
|
-
image_mode if image_mode is not None else ImageRefMode.PLACEHOLDER
|
|
168
|
-
)
|
|
169
|
-
my_image_placeholder = (
|
|
170
|
-
image_placeholder if image_placeholder is not None else "<!-- image -->"
|
|
171
|
-
)
|
|
188
|
+
params = MarkdownParams(**kwargs)
|
|
172
189
|
|
|
173
190
|
texts: list[str] = []
|
|
174
191
|
|
|
175
192
|
cap_res = doc_serializer.serialize_captions(
|
|
176
193
|
item=item,
|
|
177
|
-
|
|
194
|
+
**kwargs,
|
|
178
195
|
)
|
|
179
196
|
if cap_res.text:
|
|
180
197
|
texts.append(cap_res.text)
|
|
181
198
|
|
|
182
|
-
if item.self_ref not in doc_serializer.get_excluded_refs():
|
|
199
|
+
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
183
200
|
img_res = self._serialize_image_part(
|
|
184
201
|
item=item,
|
|
185
202
|
doc=doc,
|
|
186
|
-
image_mode=
|
|
187
|
-
image_placeholder=
|
|
203
|
+
image_mode=params.image_mode,
|
|
204
|
+
image_placeholder=params.image_placeholder,
|
|
188
205
|
)
|
|
189
206
|
if img_res.text:
|
|
190
207
|
texts.append(img_res.text)
|
|
@@ -288,8 +305,6 @@ class MarkdownFormSerializer(BaseFormSerializer):
|
|
|
288
305
|
class MarkdownListSerializer(BaseModel, BaseListSerializer):
|
|
289
306
|
"""Markdown-specific list serializer."""
|
|
290
307
|
|
|
291
|
-
indent: int = 4
|
|
292
|
-
|
|
293
308
|
@override
|
|
294
309
|
def serialize(
|
|
295
310
|
self,
|
|
@@ -303,16 +318,26 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
|
|
|
303
318
|
**kwargs,
|
|
304
319
|
) -> SerializationResult:
|
|
305
320
|
"""Serializes the passed item."""
|
|
321
|
+
params = MarkdownParams(**kwargs)
|
|
306
322
|
my_visited = visited or set()
|
|
307
323
|
parts = doc_serializer.get_parts(
|
|
308
|
-
|
|
324
|
+
item=item,
|
|
309
325
|
list_level=list_level + 1,
|
|
310
326
|
is_inline_scope=is_inline_scope,
|
|
311
327
|
visited=my_visited,
|
|
328
|
+
**kwargs,
|
|
312
329
|
)
|
|
313
|
-
|
|
330
|
+
sep = "\n"
|
|
331
|
+
my_parts: list[SerializationResult] = []
|
|
332
|
+
for p in parts:
|
|
333
|
+
if p.text and p.text[0] == " " and my_parts:
|
|
334
|
+
my_parts[-1].text = sep.join([my_parts[-1].text, p.text]) # update last
|
|
335
|
+
else:
|
|
336
|
+
my_parts.append(p)
|
|
337
|
+
|
|
338
|
+
indent_str = list_level * params.indent * " "
|
|
314
339
|
is_ol = isinstance(item, OrderedList)
|
|
315
|
-
text_res =
|
|
340
|
+
text_res = sep.join(
|
|
316
341
|
[
|
|
317
342
|
# avoid additional marker on already evaled sublists
|
|
318
343
|
(
|
|
@@ -320,7 +345,7 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
|
|
|
320
345
|
if c.text and c.text[0] == " "
|
|
321
346
|
else f"{indent_str}{f'{i + 1}.' if is_ol else '-'} {c.text}"
|
|
322
347
|
)
|
|
323
|
-
for i, c in enumerate(
|
|
348
|
+
for i, c in enumerate(my_parts)
|
|
324
349
|
]
|
|
325
350
|
)
|
|
326
351
|
return SerializationResult(text=text_res)
|
|
@@ -343,7 +368,7 @@ class MarkdownInlineSerializer(BaseInlineSerializer):
|
|
|
343
368
|
"""Serializes the passed item."""
|
|
344
369
|
my_visited = visited or set()
|
|
345
370
|
parts = doc_serializer.get_parts(
|
|
346
|
-
|
|
371
|
+
item=item,
|
|
347
372
|
list_level=list_level,
|
|
348
373
|
is_inline_scope=True,
|
|
349
374
|
visited=my_visited,
|
|
@@ -385,6 +410,8 @@ class MarkdownDocSerializer(DocSerializer):
|
|
|
385
410
|
list_serializer: BaseListSerializer = MarkdownListSerializer()
|
|
386
411
|
inline_serializer: BaseInlineSerializer = MarkdownInlineSerializer()
|
|
387
412
|
|
|
413
|
+
params: MarkdownParams = MarkdownParams()
|
|
414
|
+
|
|
388
415
|
@override
|
|
389
416
|
def serialize_bold(self, text: str, **kwargs):
|
|
390
417
|
"""Apply Markdown-specific bold serialization."""
|
|
@@ -442,7 +469,8 @@ class MarkdownDocSerializer(DocSerializer):
|
|
|
442
469
|
) -> str:
|
|
443
470
|
"""Apply some text post-processing steps."""
|
|
444
471
|
res = text
|
|
445
|
-
|
|
472
|
+
params = self.params.merge_with_patch(patch=kwargs)
|
|
473
|
+
if escape_underscores and params.escape_underscores:
|
|
446
474
|
res = self._escape_underscores(text)
|
|
447
475
|
if escape_html:
|
|
448
476
|
res = html.escape(res, quote=False)
|
|
@@ -454,8 +482,17 @@ class MarkdownDocSerializer(DocSerializer):
|
|
|
454
482
|
return res
|
|
455
483
|
|
|
456
484
|
@override
|
|
457
|
-
def
|
|
458
|
-
"""
|
|
459
|
-
|
|
460
|
-
text_res = "\n\n".join([p.text for p in parts if p.text])
|
|
485
|
+
def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
|
|
486
|
+
"""Serialize a page out of its parts."""
|
|
487
|
+
text_res = "\n\n".join([p.text for p in parts])
|
|
461
488
|
return SerializationResult(text=text_res)
|
|
489
|
+
|
|
490
|
+
@override
|
|
491
|
+
def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
|
|
492
|
+
"""Serialize a document out of its pages."""
|
|
493
|
+
if self.params.page_break_placeholder is not None:
|
|
494
|
+
sep = f"\n\n{self.params.page_break_placeholder}\n\n"
|
|
495
|
+
text_res = sep.join([p.text for p in pages if p.text])
|
|
496
|
+
return SerializationResult(text=text_res)
|
|
497
|
+
else:
|
|
498
|
+
return self.serialize_page(parts=pages)
|