docling-core 2.23.3__py3-none-any.whl → 2.24.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/experimental/serializer/base.py +2 -2
- docling_core/experimental/serializer/common.py +250 -196
- docling_core/experimental/serializer/doctags.py +492 -0
- docling_core/experimental/serializer/markdown.py +70 -41
- docling_core/types/doc/document.py +412 -418
- docling_core/types/doc/page.py +28 -9
- docling_core/types/doc/tokens.py +192 -26
- {docling_core-2.23.3.dist-info → docling_core-2.24.1.dist-info}/METADATA +1 -1
- {docling_core-2.23.3.dist-info → docling_core-2.24.1.dist-info}/RECORD +12 -11
- {docling_core-2.23.3.dist-info → docling_core-2.24.1.dist-info}/LICENSE +0 -0
- {docling_core-2.23.3.dist-info → docling_core-2.24.1.dist-info}/WHEEL +0 -0
- {docling_core-2.23.3.dist-info → docling_core-2.24.1.dist-info}/entry_points.txt +0 -0
|
@@ -26,12 +26,14 @@ from docling_core.experimental.serializer.base import (
|
|
|
26
26
|
BaseTextSerializer,
|
|
27
27
|
SerializationResult,
|
|
28
28
|
)
|
|
29
|
-
from docling_core.experimental.serializer.common import DocSerializer
|
|
29
|
+
from docling_core.experimental.serializer.common import CommonParams, DocSerializer
|
|
30
30
|
from docling_core.types.doc.base import ImageRefMode
|
|
31
31
|
from docling_core.types.doc.document import (
|
|
32
32
|
CodeItem,
|
|
33
|
+
ContentLayer,
|
|
33
34
|
DocItem,
|
|
34
35
|
DoclingDocument,
|
|
36
|
+
FloatingItem,
|
|
35
37
|
Formatting,
|
|
36
38
|
FormItem,
|
|
37
39
|
FormulaItem,
|
|
@@ -49,10 +51,20 @@ from docling_core.types.doc.document import (
|
|
|
49
51
|
)
|
|
50
52
|
|
|
51
53
|
|
|
52
|
-
class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
53
|
-
"""Markdown-specific text item serializer."""
|
|
54
|
+
class MarkdownParams(CommonParams):
|
|
55
|
+
"""Markdown-specific serialization parameters."""
|
|
54
56
|
|
|
57
|
+
layers: set[ContentLayer] = {ContentLayer.BODY}
|
|
58
|
+
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER
|
|
59
|
+
image_placeholder: str = "<!-- image -->"
|
|
60
|
+
indent: int = 4
|
|
55
61
|
wrap_width: Optional[PositiveInt] = None
|
|
62
|
+
page_break_placeholder: Optional[str] = None # e.g. "<!-- page break -->"
|
|
63
|
+
escape_underscores: bool = True
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
67
|
+
"""Markdown-specific text item serializer."""
|
|
56
68
|
|
|
57
69
|
@override
|
|
58
70
|
def serialize(
|
|
@@ -65,37 +77,47 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
65
77
|
**kwargs,
|
|
66
78
|
) -> SerializationResult:
|
|
67
79
|
"""Serializes the passed item."""
|
|
80
|
+
params = MarkdownParams(**kwargs)
|
|
81
|
+
parts: list[str] = []
|
|
68
82
|
escape_html = True
|
|
69
83
|
escape_underscores = True
|
|
70
84
|
if isinstance(item, TitleItem):
|
|
71
|
-
|
|
85
|
+
text = f"# {item.text}"
|
|
72
86
|
elif isinstance(item, SectionHeaderItem):
|
|
73
|
-
|
|
87
|
+
text = f"{(item.level + 1) * '#'} {item.text}"
|
|
74
88
|
elif isinstance(item, CodeItem):
|
|
75
|
-
|
|
89
|
+
text = f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
|
|
76
90
|
escape_html = False
|
|
77
91
|
escape_underscores = False
|
|
78
92
|
elif isinstance(item, FormulaItem):
|
|
79
93
|
if item.text:
|
|
80
|
-
|
|
94
|
+
text = f"${item.text}$" if is_inline_scope else f"$${item.text}$$"
|
|
81
95
|
elif item.orig:
|
|
82
|
-
|
|
96
|
+
text = "<!-- formula-not-decoded -->"
|
|
83
97
|
else:
|
|
84
|
-
|
|
98
|
+
text = ""
|
|
85
99
|
escape_html = False
|
|
86
100
|
escape_underscores = False
|
|
87
|
-
elif
|
|
88
|
-
|
|
101
|
+
elif params.wrap_width:
|
|
102
|
+
text = textwrap.fill(item.text, width=params.wrap_width)
|
|
89
103
|
else:
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
104
|
+
text = item.text
|
|
105
|
+
parts.append(text)
|
|
106
|
+
|
|
107
|
+
if isinstance(item, FloatingItem):
|
|
108
|
+
cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
|
|
109
|
+
if cap_text:
|
|
110
|
+
parts.append(cap_text)
|
|
111
|
+
|
|
112
|
+
text_res = (" " if is_inline_scope else "\n\n").join(parts)
|
|
113
|
+
text_res = doc_serializer.post_process(
|
|
114
|
+
text=text_res,
|
|
93
115
|
escape_html=escape_html,
|
|
94
116
|
escape_underscores=escape_underscores,
|
|
95
117
|
formatting=item.formatting,
|
|
96
118
|
hyperlink=item.hyperlink,
|
|
97
119
|
)
|
|
98
|
-
return SerializationResult(text=
|
|
120
|
+
return SerializationResult(text=text_res)
|
|
99
121
|
|
|
100
122
|
|
|
101
123
|
class MarkdownTableSerializer(BaseTableSerializer):
|
|
@@ -113,12 +135,14 @@ class MarkdownTableSerializer(BaseTableSerializer):
|
|
|
113
135
|
"""Serializes the passed item."""
|
|
114
136
|
text_parts: list[str] = []
|
|
115
137
|
|
|
116
|
-
|
|
138
|
+
cap_res = doc_serializer.serialize_captions(
|
|
117
139
|
item=item,
|
|
118
|
-
|
|
119
|
-
|
|
140
|
+
**kwargs,
|
|
141
|
+
)
|
|
142
|
+
if cap_res.text:
|
|
143
|
+
text_parts.append(cap_res.text)
|
|
120
144
|
|
|
121
|
-
if item.self_ref not in doc_serializer.get_excluded_refs():
|
|
145
|
+
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
122
146
|
rows = [
|
|
123
147
|
[
|
|
124
148
|
# make sure that md tables are not broken
|
|
@@ -158,33 +182,26 @@ class MarkdownPictureSerializer(BasePictureSerializer):
|
|
|
158
182
|
item: PictureItem,
|
|
159
183
|
doc_serializer: BaseDocSerializer,
|
|
160
184
|
doc: DoclingDocument,
|
|
161
|
-
image_mode: Optional[ImageRefMode] = None,
|
|
162
|
-
image_placeholder: Optional[str] = None,
|
|
163
185
|
**kwargs,
|
|
164
186
|
) -> SerializationResult:
|
|
165
187
|
"""Serializes the passed item."""
|
|
166
|
-
|
|
167
|
-
image_mode if image_mode is not None else ImageRefMode.PLACEHOLDER
|
|
168
|
-
)
|
|
169
|
-
my_image_placeholder = (
|
|
170
|
-
image_placeholder if image_placeholder is not None else "<!-- image -->"
|
|
171
|
-
)
|
|
188
|
+
params = MarkdownParams(**kwargs)
|
|
172
189
|
|
|
173
190
|
texts: list[str] = []
|
|
174
191
|
|
|
175
192
|
cap_res = doc_serializer.serialize_captions(
|
|
176
193
|
item=item,
|
|
177
|
-
|
|
194
|
+
**kwargs,
|
|
178
195
|
)
|
|
179
196
|
if cap_res.text:
|
|
180
197
|
texts.append(cap_res.text)
|
|
181
198
|
|
|
182
|
-
if item.self_ref not in doc_serializer.get_excluded_refs():
|
|
199
|
+
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
183
200
|
img_res = self._serialize_image_part(
|
|
184
201
|
item=item,
|
|
185
202
|
doc=doc,
|
|
186
|
-
image_mode=
|
|
187
|
-
image_placeholder=my_image_placeholder,
|
|
203
|
+
image_mode=params.image_mode,
|
|
204
|
+
image_placeholder=params.image_placeholder,
|
|
188
205
|
)
|
|
189
206
|
if img_res.text:
|
|
190
207
|
texts.append(img_res.text)
|
|
@@ -288,8 +305,6 @@ class MarkdownFormSerializer(BaseFormSerializer):
|
|
|
288
305
|
class MarkdownListSerializer(BaseModel, BaseListSerializer):
|
|
289
306
|
"""Markdown-specific list serializer."""
|
|
290
307
|
|
|
291
|
-
indent: int = 4
|
|
292
|
-
|
|
293
308
|
@override
|
|
294
309
|
def serialize(
|
|
295
310
|
self,
|
|
@@ -303,12 +318,14 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
|
|
|
303
318
|
**kwargs,
|
|
304
319
|
) -> SerializationResult:
|
|
305
320
|
"""Serializes the passed item."""
|
|
321
|
+
params = MarkdownParams(**kwargs)
|
|
306
322
|
my_visited = visited or set()
|
|
307
323
|
parts = doc_serializer.get_parts(
|
|
308
|
-
|
|
324
|
+
item=item,
|
|
309
325
|
list_level=list_level + 1,
|
|
310
326
|
is_inline_scope=is_inline_scope,
|
|
311
327
|
visited=my_visited,
|
|
328
|
+
**kwargs,
|
|
312
329
|
)
|
|
313
330
|
sep = "\n"
|
|
314
331
|
my_parts: list[SerializationResult] = []
|
|
@@ -318,7 +335,7 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
|
|
|
318
335
|
else:
|
|
319
336
|
my_parts.append(p)
|
|
320
337
|
|
|
321
|
-
indent_str = list_level * self.indent * " "
|
|
338
|
+
indent_str = list_level * params.indent * " "
|
|
322
339
|
is_ol = isinstance(item, OrderedList)
|
|
323
340
|
text_res = sep.join(
|
|
324
341
|
[
|
|
@@ -351,7 +368,7 @@ class MarkdownInlineSerializer(BaseInlineSerializer):
|
|
|
351
368
|
"""Serializes the passed item."""
|
|
352
369
|
my_visited = visited or set()
|
|
353
370
|
parts = doc_serializer.get_parts(
|
|
354
|
-
|
|
371
|
+
item=item,
|
|
355
372
|
list_level=list_level,
|
|
356
373
|
is_inline_scope=True,
|
|
357
374
|
visited=my_visited,
|
|
@@ -393,6 +410,8 @@ class MarkdownDocSerializer(DocSerializer):
|
|
|
393
410
|
list_serializer: BaseListSerializer = MarkdownListSerializer()
|
|
394
411
|
inline_serializer: BaseInlineSerializer = MarkdownInlineSerializer()
|
|
395
412
|
|
|
413
|
+
params: MarkdownParams = MarkdownParams()
|
|
414
|
+
|
|
396
415
|
@override
|
|
397
416
|
def serialize_bold(self, text: str, **kwargs):
|
|
398
417
|
"""Apply Markdown-specific bold serialization."""
|
|
@@ -450,7 +469,8 @@ class MarkdownDocSerializer(DocSerializer):
|
|
|
450
469
|
) -> str:
|
|
451
470
|
"""Apply some text post-processing steps."""
|
|
452
471
|
res = text
|
|
453
|
-
|
|
472
|
+
params = self.params.merge_with_patch(patch=kwargs)
|
|
473
|
+
if escape_underscores and params.escape_underscores:
|
|
454
474
|
res = self._escape_underscores(text)
|
|
455
475
|
if escape_html:
|
|
456
476
|
res = html.escape(res, quote=False)
|
|
@@ -462,8 +482,17 @@ class MarkdownDocSerializer(DocSerializer):
|
|
|
462
482
|
return res
|
|
463
483
|
|
|
464
484
|
@override
|
|
465
|
-
def
|
|
466
|
-
"""
|
|
467
|
-
|
|
468
|
-
text_res = "\n\n".join([p.text for p in parts if p.text])
|
|
485
|
+
def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
|
|
486
|
+
"""Serialize a page out of its parts."""
|
|
487
|
+
text_res = "\n\n".join([p.text for p in parts])
|
|
469
488
|
return SerializationResult(text=text_res)
|
|
489
|
+
|
|
490
|
+
@override
|
|
491
|
+
def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
|
|
492
|
+
"""Serialize a document out of its pages."""
|
|
493
|
+
if self.params.page_break_placeholder is not None:
|
|
494
|
+
sep = f"\n\n{self.params.page_break_placeholder}\n\n"
|
|
495
|
+
text_res = sep.join([p.text for p in pages if p.text])
|
|
496
|
+
return SerializationResult(text=text_res)
|
|
497
|
+
else:
|
|
498
|
+
return self.serialize_page(parts=pages)
|