docling-core 2.23.2__py3-none-any.whl → 2.24.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -26,12 +26,14 @@ from docling_core.experimental.serializer.base import (
26
26
  BaseTextSerializer,
27
27
  SerializationResult,
28
28
  )
29
- from docling_core.experimental.serializer.common import DocSerializer
29
+ from docling_core.experimental.serializer.common import CommonParams, DocSerializer
30
30
  from docling_core.types.doc.base import ImageRefMode
31
31
  from docling_core.types.doc.document import (
32
32
  CodeItem,
33
+ ContentLayer,
33
34
  DocItem,
34
35
  DoclingDocument,
36
+ FloatingItem,
35
37
  Formatting,
36
38
  FormItem,
37
39
  FormulaItem,
@@ -49,10 +51,20 @@ from docling_core.types.doc.document import (
49
51
  )
50
52
 
51
53
 
52
- class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
53
- """Markdown-specific text item serializer."""
54
+ class MarkdownParams(CommonParams):
55
+ """Markdown-specific serialization parameters."""
54
56
 
57
+ layers: set[ContentLayer] = {ContentLayer.BODY}
58
+ image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER
59
+ image_placeholder: str = "<!-- image -->"
60
+ indent: int = 4
55
61
  wrap_width: Optional[PositiveInt] = None
62
+ page_break_placeholder: Optional[str] = None # e.g. "<!-- page break -->"
63
+ escape_underscores: bool = True
64
+
65
+
66
+ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
67
+ """Markdown-specific text item serializer."""
56
68
 
57
69
  @override
58
70
  def serialize(
@@ -65,37 +77,47 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
65
77
  **kwargs,
66
78
  ) -> SerializationResult:
67
79
  """Serializes the passed item."""
80
+ params = MarkdownParams(**kwargs)
81
+ parts: list[str] = []
68
82
  escape_html = True
69
83
  escape_underscores = True
70
84
  if isinstance(item, TitleItem):
71
- res = f"# {item.text}"
85
+ text = f"# {item.text}"
72
86
  elif isinstance(item, SectionHeaderItem):
73
- res = f"{(item.level + 1) * '#'} {item.text}"
87
+ text = f"{(item.level + 1) * '#'} {item.text}"
74
88
  elif isinstance(item, CodeItem):
75
- res = f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
89
+ text = f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
76
90
  escape_html = False
77
91
  escape_underscores = False
78
92
  elif isinstance(item, FormulaItem):
79
93
  if item.text:
80
- res = f"${item.text}$" if is_inline_scope else f"$${item.text}$$"
94
+ text = f"${item.text}$" if is_inline_scope else f"$${item.text}$$"
81
95
  elif item.orig:
82
- res = "<!-- formula-not-decoded -->"
96
+ text = "<!-- formula-not-decoded -->"
83
97
  else:
84
- res = ""
98
+ text = ""
85
99
  escape_html = False
86
100
  escape_underscores = False
87
- elif self.wrap_width:
88
- res = textwrap.fill(item.text, width=self.wrap_width)
101
+ elif params.wrap_width:
102
+ text = textwrap.fill(item.text, width=params.wrap_width)
89
103
  else:
90
- res = item.text
91
- res = doc_serializer.post_process(
92
- text=res,
104
+ text = item.text
105
+ parts.append(text)
106
+
107
+ if isinstance(item, FloatingItem):
108
+ cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
109
+ if cap_text:
110
+ parts.append(cap_text)
111
+
112
+ text_res = (" " if is_inline_scope else "\n\n").join(parts)
113
+ text_res = doc_serializer.post_process(
114
+ text=text_res,
93
115
  escape_html=escape_html,
94
116
  escape_underscores=escape_underscores,
95
117
  formatting=item.formatting,
96
118
  hyperlink=item.hyperlink,
97
119
  )
98
- return SerializationResult(text=res)
120
+ return SerializationResult(text=text_res)
99
121
 
100
122
 
101
123
  class MarkdownTableSerializer(BaseTableSerializer):
@@ -113,12 +135,14 @@ class MarkdownTableSerializer(BaseTableSerializer):
113
135
  """Serializes the passed item."""
114
136
  text_parts: list[str] = []
115
137
 
116
- if caption_txt := doc_serializer.serialize_captions(
138
+ cap_res = doc_serializer.serialize_captions(
117
139
  item=item,
118
- ).text:
119
- text_parts.append(caption_txt)
140
+ **kwargs,
141
+ )
142
+ if cap_res.text:
143
+ text_parts.append(cap_res.text)
120
144
 
121
- if item.self_ref not in doc_serializer.get_excluded_refs():
145
+ if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
122
146
  rows = [
123
147
  [
124
148
  # make sure that md tables are not broken
@@ -158,33 +182,26 @@ class MarkdownPictureSerializer(BasePictureSerializer):
158
182
  item: PictureItem,
159
183
  doc_serializer: BaseDocSerializer,
160
184
  doc: DoclingDocument,
161
- image_mode: Optional[ImageRefMode] = None,
162
- image_placeholder: Optional[str] = None,
163
185
  **kwargs,
164
186
  ) -> SerializationResult:
165
187
  """Serializes the passed item."""
166
- my_image_mode = (
167
- image_mode if image_mode is not None else ImageRefMode.PLACEHOLDER
168
- )
169
- my_image_placeholder = (
170
- image_placeholder if image_placeholder is not None else "<!-- image -->"
171
- )
188
+ params = MarkdownParams(**kwargs)
172
189
 
173
190
  texts: list[str] = []
174
191
 
175
192
  cap_res = doc_serializer.serialize_captions(
176
193
  item=item,
177
- separator="\n",
194
+ **kwargs,
178
195
  )
179
196
  if cap_res.text:
180
197
  texts.append(cap_res.text)
181
198
 
182
- if item.self_ref not in doc_serializer.get_excluded_refs():
199
+ if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
183
200
  img_res = self._serialize_image_part(
184
201
  item=item,
185
202
  doc=doc,
186
- image_mode=my_image_mode,
187
- image_placeholder=my_image_placeholder,
203
+ image_mode=params.image_mode,
204
+ image_placeholder=params.image_placeholder,
188
205
  )
189
206
  if img_res.text:
190
207
  texts.append(img_res.text)
@@ -288,8 +305,6 @@ class MarkdownFormSerializer(BaseFormSerializer):
288
305
  class MarkdownListSerializer(BaseModel, BaseListSerializer):
289
306
  """Markdown-specific list serializer."""
290
307
 
291
- indent: int = 4
292
-
293
308
  @override
294
309
  def serialize(
295
310
  self,
@@ -303,16 +318,26 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
303
318
  **kwargs,
304
319
  ) -> SerializationResult:
305
320
  """Serializes the passed item."""
321
+ params = MarkdownParams(**kwargs)
306
322
  my_visited = visited or set()
307
323
  parts = doc_serializer.get_parts(
308
- node=item,
324
+ item=item,
309
325
  list_level=list_level + 1,
310
326
  is_inline_scope=is_inline_scope,
311
327
  visited=my_visited,
328
+ **kwargs,
312
329
  )
313
- indent_str = list_level * self.indent * " "
330
+ sep = "\n"
331
+ my_parts: list[SerializationResult] = []
332
+ for p in parts:
333
+ if p.text and p.text[0] == " " and my_parts:
334
+ my_parts[-1].text = sep.join([my_parts[-1].text, p.text]) # update last
335
+ else:
336
+ my_parts.append(p)
337
+
338
+ indent_str = list_level * params.indent * " "
314
339
  is_ol = isinstance(item, OrderedList)
315
- text_res = "\n".join(
340
+ text_res = sep.join(
316
341
  [
317
342
  # avoid additional marker on already evaled sublists
318
343
  (
@@ -320,7 +345,7 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
320
345
  if c.text and c.text[0] == " "
321
346
  else f"{indent_str}{f'{i + 1}.' if is_ol else '-'} {c.text}"
322
347
  )
323
- for i, c in enumerate(parts)
348
+ for i, c in enumerate(my_parts)
324
349
  ]
325
350
  )
326
351
  return SerializationResult(text=text_res)
@@ -343,7 +368,7 @@ class MarkdownInlineSerializer(BaseInlineSerializer):
343
368
  """Serializes the passed item."""
344
369
  my_visited = visited or set()
345
370
  parts = doc_serializer.get_parts(
346
- node=item,
371
+ item=item,
347
372
  list_level=list_level,
348
373
  is_inline_scope=True,
349
374
  visited=my_visited,
@@ -385,6 +410,8 @@ class MarkdownDocSerializer(DocSerializer):
385
410
  list_serializer: BaseListSerializer = MarkdownListSerializer()
386
411
  inline_serializer: BaseInlineSerializer = MarkdownInlineSerializer()
387
412
 
413
+ params: MarkdownParams = MarkdownParams()
414
+
388
415
  @override
389
416
  def serialize_bold(self, text: str, **kwargs):
390
417
  """Apply Markdown-specific bold serialization."""
@@ -442,7 +469,8 @@ class MarkdownDocSerializer(DocSerializer):
442
469
  ) -> str:
443
470
  """Apply some text post-processing steps."""
444
471
  res = text
445
- if escape_underscores and self.escape_underscores:
472
+ params = self.params.merge_with_patch(patch=kwargs)
473
+ if escape_underscores and params.escape_underscores:
446
474
  res = self._escape_underscores(text)
447
475
  if escape_html:
448
476
  res = html.escape(res, quote=False)
@@ -454,8 +482,17 @@ class MarkdownDocSerializer(DocSerializer):
454
482
  return res
455
483
 
456
484
  @override
457
- def serialize(self, **kwargs) -> SerializationResult:
458
- """Run the serialization."""
459
- parts = self.get_parts()
460
- text_res = "\n\n".join([p.text for p in parts if p.text])
485
+ def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
486
+ """Serialize a page out of its parts."""
487
+ text_res = "\n\n".join([p.text for p in parts])
461
488
  return SerializationResult(text=text_res)
489
+
490
+ @override
491
+ def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
492
+ """Serialize a document out of its pages."""
493
+ if self.params.page_break_placeholder is not None:
494
+ sep = f"\n\n{self.params.page_break_placeholder}\n\n"
495
+ text_res = sep.join([p.text for p in pages if p.text])
496
+ return SerializationResult(text=text_res)
497
+ else:
498
+ return self.serialize_page(parts=pages)