docling-core 2.23.3__py3-none-any.whl → 2.24.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -26,12 +26,14 @@ from docling_core.experimental.serializer.base import (
26
26
  BaseTextSerializer,
27
27
  SerializationResult,
28
28
  )
29
- from docling_core.experimental.serializer.common import DocSerializer
29
+ from docling_core.experimental.serializer.common import CommonParams, DocSerializer
30
30
  from docling_core.types.doc.base import ImageRefMode
31
31
  from docling_core.types.doc.document import (
32
32
  CodeItem,
33
+ ContentLayer,
33
34
  DocItem,
34
35
  DoclingDocument,
36
+ FloatingItem,
35
37
  Formatting,
36
38
  FormItem,
37
39
  FormulaItem,
@@ -49,10 +51,20 @@ from docling_core.types.doc.document import (
49
51
  )
50
52
 
51
53
 
52
- class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
53
- """Markdown-specific text item serializer."""
54
+ class MarkdownParams(CommonParams):
55
+ """Markdown-specific serialization parameters."""
54
56
 
57
+ layers: set[ContentLayer] = {ContentLayer.BODY}
58
+ image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER
59
+ image_placeholder: str = "<!-- image -->"
60
+ indent: int = 4
55
61
  wrap_width: Optional[PositiveInt] = None
62
+ page_break_placeholder: Optional[str] = None # e.g. "<!-- page break -->"
63
+ escape_underscores: bool = True
64
+
65
+
66
+ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
67
+ """Markdown-specific text item serializer."""
56
68
 
57
69
  @override
58
70
  def serialize(
@@ -65,37 +77,47 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
65
77
  **kwargs,
66
78
  ) -> SerializationResult:
67
79
  """Serializes the passed item."""
80
+ params = MarkdownParams(**kwargs)
81
+ parts: list[str] = []
68
82
  escape_html = True
69
83
  escape_underscores = True
70
84
  if isinstance(item, TitleItem):
71
- res = f"# {item.text}"
85
+ text = f"# {item.text}"
72
86
  elif isinstance(item, SectionHeaderItem):
73
- res = f"{(item.level + 1) * '#'} {item.text}"
87
+ text = f"{(item.level + 1) * '#'} {item.text}"
74
88
  elif isinstance(item, CodeItem):
75
- res = f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
89
+ text = f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
76
90
  escape_html = False
77
91
  escape_underscores = False
78
92
  elif isinstance(item, FormulaItem):
79
93
  if item.text:
80
- res = f"${item.text}$" if is_inline_scope else f"$${item.text}$$"
94
+ text = f"${item.text}$" if is_inline_scope else f"$${item.text}$$"
81
95
  elif item.orig:
82
- res = "<!-- formula-not-decoded -->"
96
+ text = "<!-- formula-not-decoded -->"
83
97
  else:
84
- res = ""
98
+ text = ""
85
99
  escape_html = False
86
100
  escape_underscores = False
87
- elif self.wrap_width:
88
- res = textwrap.fill(item.text, width=self.wrap_width)
101
+ elif params.wrap_width:
102
+ text = textwrap.fill(item.text, width=params.wrap_width)
89
103
  else:
90
- res = item.text
91
- res = doc_serializer.post_process(
92
- text=res,
104
+ text = item.text
105
+ parts.append(text)
106
+
107
+ if isinstance(item, FloatingItem):
108
+ cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
109
+ if cap_text:
110
+ parts.append(cap_text)
111
+
112
+ text_res = (" " if is_inline_scope else "\n\n").join(parts)
113
+ text_res = doc_serializer.post_process(
114
+ text=text_res,
93
115
  escape_html=escape_html,
94
116
  escape_underscores=escape_underscores,
95
117
  formatting=item.formatting,
96
118
  hyperlink=item.hyperlink,
97
119
  )
98
- return SerializationResult(text=res)
120
+ return SerializationResult(text=text_res)
99
121
 
100
122
 
101
123
  class MarkdownTableSerializer(BaseTableSerializer):
@@ -113,12 +135,14 @@ class MarkdownTableSerializer(BaseTableSerializer):
113
135
  """Serializes the passed item."""
114
136
  text_parts: list[str] = []
115
137
 
116
- if caption_txt := doc_serializer.serialize_captions(
138
+ cap_res = doc_serializer.serialize_captions(
117
139
  item=item,
118
- ).text:
119
- text_parts.append(caption_txt)
140
+ **kwargs,
141
+ )
142
+ if cap_res.text:
143
+ text_parts.append(cap_res.text)
120
144
 
121
- if item.self_ref not in doc_serializer.get_excluded_refs():
145
+ if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
122
146
  rows = [
123
147
  [
124
148
  # make sure that md tables are not broken
@@ -158,33 +182,26 @@ class MarkdownPictureSerializer(BasePictureSerializer):
158
182
  item: PictureItem,
159
183
  doc_serializer: BaseDocSerializer,
160
184
  doc: DoclingDocument,
161
- image_mode: Optional[ImageRefMode] = None,
162
- image_placeholder: Optional[str] = None,
163
185
  **kwargs,
164
186
  ) -> SerializationResult:
165
187
  """Serializes the passed item."""
166
- my_image_mode = (
167
- image_mode if image_mode is not None else ImageRefMode.PLACEHOLDER
168
- )
169
- my_image_placeholder = (
170
- image_placeholder if image_placeholder is not None else "<!-- image -->"
171
- )
188
+ params = MarkdownParams(**kwargs)
172
189
 
173
190
  texts: list[str] = []
174
191
 
175
192
  cap_res = doc_serializer.serialize_captions(
176
193
  item=item,
177
- separator="\n",
194
+ **kwargs,
178
195
  )
179
196
  if cap_res.text:
180
197
  texts.append(cap_res.text)
181
198
 
182
- if item.self_ref not in doc_serializer.get_excluded_refs():
199
+ if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
183
200
  img_res = self._serialize_image_part(
184
201
  item=item,
185
202
  doc=doc,
186
- image_mode=my_image_mode,
187
- image_placeholder=my_image_placeholder,
203
+ image_mode=params.image_mode,
204
+ image_placeholder=params.image_placeholder,
188
205
  )
189
206
  if img_res.text:
190
207
  texts.append(img_res.text)
@@ -288,8 +305,6 @@ class MarkdownFormSerializer(BaseFormSerializer):
288
305
  class MarkdownListSerializer(BaseModel, BaseListSerializer):
289
306
  """Markdown-specific list serializer."""
290
307
 
291
- indent: int = 4
292
-
293
308
  @override
294
309
  def serialize(
295
310
  self,
@@ -303,12 +318,14 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
303
318
  **kwargs,
304
319
  ) -> SerializationResult:
305
320
  """Serializes the passed item."""
321
+ params = MarkdownParams(**kwargs)
306
322
  my_visited = visited or set()
307
323
  parts = doc_serializer.get_parts(
308
- node=item,
324
+ item=item,
309
325
  list_level=list_level + 1,
310
326
  is_inline_scope=is_inline_scope,
311
327
  visited=my_visited,
328
+ **kwargs,
312
329
  )
313
330
  sep = "\n"
314
331
  my_parts: list[SerializationResult] = []
@@ -318,7 +335,7 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
318
335
  else:
319
336
  my_parts.append(p)
320
337
 
321
- indent_str = list_level * self.indent * " "
338
+ indent_str = list_level * params.indent * " "
322
339
  is_ol = isinstance(item, OrderedList)
323
340
  text_res = sep.join(
324
341
  [
@@ -351,7 +368,7 @@ class MarkdownInlineSerializer(BaseInlineSerializer):
351
368
  """Serializes the passed item."""
352
369
  my_visited = visited or set()
353
370
  parts = doc_serializer.get_parts(
354
- node=item,
371
+ item=item,
355
372
  list_level=list_level,
356
373
  is_inline_scope=True,
357
374
  visited=my_visited,
@@ -393,6 +410,8 @@ class MarkdownDocSerializer(DocSerializer):
393
410
  list_serializer: BaseListSerializer = MarkdownListSerializer()
394
411
  inline_serializer: BaseInlineSerializer = MarkdownInlineSerializer()
395
412
 
413
+ params: MarkdownParams = MarkdownParams()
414
+
396
415
  @override
397
416
  def serialize_bold(self, text: str, **kwargs):
398
417
  """Apply Markdown-specific bold serialization."""
@@ -450,7 +469,8 @@ class MarkdownDocSerializer(DocSerializer):
450
469
  ) -> str:
451
470
  """Apply some text post-processing steps."""
452
471
  res = text
453
- if escape_underscores and self.escape_underscores:
472
+ params = self.params.merge_with_patch(patch=kwargs)
473
+ if escape_underscores and params.escape_underscores:
454
474
  res = self._escape_underscores(text)
455
475
  if escape_html:
456
476
  res = html.escape(res, quote=False)
@@ -462,8 +482,17 @@ class MarkdownDocSerializer(DocSerializer):
462
482
  return res
463
483
 
464
484
  @override
465
- def serialize(self, **kwargs) -> SerializationResult:
466
- """Run the serialization."""
467
- parts = self.get_parts()
468
- text_res = "\n\n".join([p.text for p in parts if p.text])
485
+ def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
486
+ """Serialize a page out of its parts."""
487
+ text_res = "\n\n".join([p.text for p in parts])
469
488
  return SerializationResult(text=text_res)
489
+
490
+ @override
491
+ def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
492
+ """Serialize a document out of its pages."""
493
+ if self.params.page_break_placeholder is not None:
494
+ sep = f"\n\n{self.params.page_break_placeholder}\n\n"
495
+ text_res = sep.join([p.text for p in pages if p.text])
496
+ return SerializationResult(text=text_res)
497
+ else:
498
+ return self.serialize_page(parts=pages)