docling-core 2.25.0__py3-none-any.whl → 2.26.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. More details are available on the registry page for this release.

@@ -0,0 +1,212 @@
1
+ """HTML styles for different export modes."""
2
+
3
+
4
+ def _get_css_with_no_styling() -> str:
5
+ """Return default CSS styles for the HTML document."""
6
+ return "<style></style>"
7
+
8
+
9
+ def _get_css_for_split_page() -> str:
10
+ """Return default CSS styles for the HTML document."""
11
+ return """<style>
12
+ html {
13
+ background-color: #e1e1e1;
14
+ font-family: Arial, sans-serif;
15
+ line-height: 1.6;
16
+ }
17
+ img {
18
+ min-width: 500px;
19
+ max-width: 100%;
20
+ }
21
+ table {
22
+ border-collapse: collapse;
23
+ border: 0px solid #fff;
24
+ width: 100%;
25
+ }
26
+ td {
27
+ vertical-align: top;
28
+ }
29
+ .page {
30
+ background-color: white;
31
+ margin-top:15px;
32
+ padding: 30px;
33
+ border: 1px solid black;
34
+ width:100%;
35
+ max-width:1000px;
36
+ box-shadow: 0 0 10px rgba(0,0,0,0.5);
37
+ }
38
+ .page figure {
39
+ text-align: center;
40
+ }
41
+ .page img {
42
+ max-width: 900px;
43
+ min-width: auto;
44
+ }
45
+ .page table {
46
+ border-collapse: collapse;
47
+ margin: 1em 0;
48
+ width: 100%;
49
+ }
50
+ .page table td {
51
+ border: 1px solid #ddd;
52
+ padding: 8px;
53
+ text-align: left;
54
+ }
55
+ .page table th {
56
+ border: 1px solid #ddd;
57
+ padding: 8px;
58
+ text-align: left;
59
+ background-color: #f2f2f2;
60
+ font-weight: bold;
61
+ }
62
+ .page table caption {
63
+ color: #666;
64
+ font-style: italic;
65
+ margin-top: 0.5em;
66
+ padding: 8px;
67
+ margin-top: 5px;
68
+ margin-bottom: 5px;
69
+ }
70
+ .page figcaption {
71
+ color: #666;
72
+ font-style: italic;
73
+ margin-top: 0.5em;
74
+ padding: 8px;
75
+ margin-top: 5px;
76
+ margin-bottom: 5px;
77
+ }
78
+ code {
79
+ background-color: rgb(228, 228, 228);
80
+ border: 1px solid darkgray;
81
+ padding: 10px;
82
+ display: inline-block;
83
+ font-family: monospace;
84
+ max-width:980px;
85
+ word-wrap: normal;
86
+ white-space: pre-wrap;
87
+ word-wrap: break-word;
88
+ /*overflow-wrap: break-word;*/
89
+ }
90
+ </style>
91
+ """
92
+
93
+
94
+ def _get_css_for_single_column() -> str:
95
+ """Return CSS styles for the single-column HTML document."""
96
+ return """<style>
97
+ html {
98
+ background-color: #f5f5f5;
99
+ font-family: Arial, sans-serif;
100
+ line-height: 1.6;
101
+ }
102
+ body {
103
+ max-width: 800px;
104
+ margin: 0 auto;
105
+ padding: 2rem;
106
+ background-color: white;
107
+ box-shadow: 0 0 10px rgba(0,0,0,0.1);
108
+ }
109
+ h1, h2, h3, h4, h5, h6 {
110
+ color: #333;
111
+ margin-top: 1.5em;
112
+ margin-bottom: 0.5em;
113
+ }
114
+ h1 {
115
+ font-size: 2em;
116
+ border-bottom: 1px solid #eee;
117
+ padding-bottom: 0.3em;
118
+ }
119
+ table {
120
+ border-collapse: collapse;
121
+ margin: 1em 0;
122
+ width: 100%;
123
+ }
124
+ th, td {
125
+ border: 1px solid #ddd;
126
+ padding: 8px;
127
+ text-align: left;
128
+ }
129
+ th {
130
+ background-color: #f2f2f2;
131
+ font-weight: bold;
132
+ }
133
+ figure {
134
+ margin: 1.5em 0;
135
+ text-align: center;
136
+ }
137
+ figcaption {
138
+ color: #666;
139
+ font-style: italic;
140
+ margin-top: 0.5em;
141
+ }
142
+ img {
143
+ max-width: 100%;
144
+ height: auto;
145
+ }
146
+ pre {
147
+ background-color: #f6f8fa;
148
+ border-radius: 3px;
149
+ padding: 1em;
150
+ overflow: auto;
151
+ }
152
+ code {
153
+ font-family: monospace;
154
+ background-color: #f6f8fa;
155
+ padding: 0.2em 0.4em;
156
+ border-radius: 3px;
157
+ }
158
+ pre code {
159
+ background-color: transparent;
160
+ padding: 0;
161
+ }
162
+ .formula {
163
+ text-align: center;
164
+ padding: 0.5em;
165
+ margin: 1em 0;
166
+ background-color: #f9f9f9;
167
+ }
168
+ .formula-not-decoded {
169
+ text-align: center;
170
+ padding: 0.5em;
171
+ margin: 1em 0;
172
+ background: repeating-linear-gradient(
173
+ 45deg,
174
+ #f0f0f0,
175
+ #f0f0f0 10px,
176
+ #f9f9f9 10px,
177
+ #f9f9f9 20px
178
+ );
179
+ }
180
+ .page-break {
181
+ page-break-after: always;
182
+ border-top: 1px dashed #ccc;
183
+ margin: 2em 0;
184
+ }
185
+ .key-value-region {
186
+ background-color: #f9f9f9;
187
+ padding: 1em;
188
+ border-radius: 4px;
189
+ margin: 1em 0;
190
+ }
191
+ .key-value-region dt {
192
+ font-weight: bold;
193
+ }
194
+ .key-value-region dd {
195
+ margin-left: 1em;
196
+ margin-bottom: 0.5em;
197
+ }
198
+ .form-container {
199
+ border: 1px solid #ddd;
200
+ padding: 1em;
201
+ border-radius: 4px;
202
+ margin: 1em 0;
203
+ }
204
+ .form-item {
205
+ margin-bottom: 0.5em;
206
+ }
207
+ .image-classification {
208
+ font-size: 0.9em;
209
+ color: #666;
210
+ margin-top: 0.5em;
211
+ }
212
+ </style>"""
@@ -26,7 +26,12 @@ from docling_core.experimental.serializer.base import (
26
26
  BaseTextSerializer,
27
27
  SerializationResult,
28
28
  )
29
- from docling_core.experimental.serializer.common import CommonParams, DocSerializer
29
+ from docling_core.experimental.serializer.common import (
30
+ CommonParams,
31
+ DocSerializer,
32
+ _PageBreakSerResult,
33
+ create_ser_result,
34
+ )
30
35
  from docling_core.types.doc.base import ImageRefMode
31
36
  from docling_core.types.doc.document import (
32
37
  CodeItem,
@@ -43,6 +48,7 @@ from docling_core.types.doc.document import (
43
48
  NodeItem,
44
49
  OrderedList,
45
50
  PictureItem,
51
+ PictureTabularChartData,
46
52
  SectionHeaderItem,
47
53
  TableItem,
48
54
  TextItem,
@@ -57,10 +63,12 @@ class MarkdownParams(CommonParams):
57
63
  layers: set[ContentLayer] = {ContentLayer.BODY}
58
64
  image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER
59
65
  image_placeholder: str = "<!-- image -->"
66
+ enable_chart_tables: bool = True
60
67
  indent: int = 4
61
68
  wrap_width: Optional[PositiveInt] = None
62
69
  page_break_placeholder: Optional[str] = None # e.g. "<!-- page break -->"
63
70
  escape_underscores: bool = True
71
+ escape_html: bool = True
64
72
 
65
73
 
66
74
  class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
@@ -78,46 +86,51 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
78
86
  ) -> SerializationResult:
79
87
  """Serializes the passed item."""
80
88
  params = MarkdownParams(**kwargs)
81
- parts: list[str] = []
89
+ res_parts: list[SerializationResult] = []
82
90
  escape_html = True
83
91
  escape_underscores = True
84
92
  if isinstance(item, TitleItem):
85
- text = f"# {item.text}"
93
+ text_part = f"# {item.text}"
86
94
  elif isinstance(item, SectionHeaderItem):
87
- text = f"{(item.level + 1) * '#'} {item.text}"
95
+ text_part = f"{(item.level + 1) * '#'} {item.text}"
88
96
  elif isinstance(item, CodeItem):
89
- text = f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
97
+ text_part = (
98
+ f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
99
+ )
90
100
  escape_html = False
91
101
  escape_underscores = False
92
102
  elif isinstance(item, FormulaItem):
93
103
  if item.text:
94
- text = f"${item.text}$" if is_inline_scope else f"$${item.text}$$"
104
+ text_part = f"${item.text}$" if is_inline_scope else f"$${item.text}$$"
95
105
  elif item.orig:
96
- text = "<!-- formula-not-decoded -->"
106
+ text_part = "<!-- formula-not-decoded -->"
97
107
  else:
98
- text = ""
108
+ text_part = ""
99
109
  escape_html = False
100
110
  escape_underscores = False
101
111
  elif params.wrap_width:
102
- text = textwrap.fill(item.text, width=params.wrap_width)
112
+ text_part = textwrap.fill(item.text, width=params.wrap_width)
103
113
  else:
104
- text = item.text
105
- parts.append(text)
114
+ text_part = item.text
115
+
116
+ if text_part:
117
+ text_res = create_ser_result(text=text_part, span_source=item)
118
+ res_parts.append(text_res)
106
119
 
107
120
  if isinstance(item, FloatingItem):
108
- cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
109
- if cap_text:
110
- parts.append(cap_text)
121
+ cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
122
+ if cap_res.text:
123
+ res_parts.append(cap_res)
111
124
 
112
- text_res = (" " if is_inline_scope else "\n\n").join(parts)
113
- text_res = doc_serializer.post_process(
114
- text=text_res,
125
+ text = (" " if is_inline_scope else "\n\n").join([r.text for r in res_parts])
126
+ text = doc_serializer.post_process(
127
+ text=text,
115
128
  escape_html=escape_html,
116
129
  escape_underscores=escape_underscores,
117
130
  formatting=item.formatting,
118
131
  hyperlink=item.hyperlink,
119
132
  )
120
- return SerializationResult(text=text_res)
133
+ return create_ser_result(text=text, span_source=res_parts)
121
134
 
122
135
 
123
136
  class MarkdownTableSerializer(BaseTableSerializer):
@@ -133,14 +146,14 @@ class MarkdownTableSerializer(BaseTableSerializer):
133
146
  **kwargs,
134
147
  ) -> SerializationResult:
135
148
  """Serializes the passed item."""
136
- text_parts: list[str] = []
149
+ res_parts: list[SerializationResult] = []
137
150
 
138
151
  cap_res = doc_serializer.serialize_captions(
139
152
  item=item,
140
153
  **kwargs,
141
154
  )
142
155
  if cap_res.text:
143
- text_parts.append(cap_res.text)
156
+ res_parts.append(cap_res)
144
157
 
145
158
  if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
146
159
  rows = [
@@ -165,11 +178,11 @@ class MarkdownTableSerializer(BaseTableSerializer):
165
178
  else:
166
179
  table_text = ""
167
180
  if table_text:
168
- text_parts.append(table_text)
181
+ res_parts.append(create_ser_result(text=table_text, span_source=item))
169
182
 
170
- text_res = "\n\n".join(text_parts)
183
+ text_res = "\n\n".join([r.text for r in res_parts])
171
184
 
172
- return SerializationResult(text=text_res)
185
+ return create_ser_result(text=text_res, span_source=res_parts)
173
186
 
174
187
 
175
188
  class MarkdownPictureSerializer(BasePictureSerializer):
@@ -187,14 +200,14 @@ class MarkdownPictureSerializer(BasePictureSerializer):
187
200
  """Serializes the passed item."""
188
201
  params = MarkdownParams(**kwargs)
189
202
 
190
- texts: list[str] = []
203
+ res_parts: list[SerializationResult] = []
191
204
 
192
205
  cap_res = doc_serializer.serialize_captions(
193
206
  item=item,
194
207
  **kwargs,
195
208
  )
196
209
  if cap_res.text:
197
- texts.append(cap_res.text)
210
+ res_parts.append(cap_res)
198
211
 
199
212
  if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
200
213
  img_res = self._serialize_image_part(
@@ -204,11 +217,28 @@ class MarkdownPictureSerializer(BasePictureSerializer):
204
217
  image_placeholder=params.image_placeholder,
205
218
  )
206
219
  if img_res.text:
207
- texts.append(img_res.text)
208
-
209
- text_res = "\n\n".join(texts)
220
+ res_parts.append(img_res)
221
+
222
+ if params.enable_chart_tables:
223
+ # Check if picture has attached PictureTabularChartData
224
+ tabular_chart_annotations = [
225
+ ann
226
+ for ann in item.annotations
227
+ if isinstance(ann, PictureTabularChartData)
228
+ ]
229
+ if len(tabular_chart_annotations) > 0:
230
+ temp_doc = DoclingDocument(name="temp")
231
+ temp_table = temp_doc.add_table(
232
+ data=tabular_chart_annotations[0].chart_data
233
+ )
234
+ md_table_content = temp_table.export_to_markdown(temp_doc)
235
+ if len(md_table_content) > 0:
236
+ res_parts.append(
237
+ create_ser_result(text=md_table_content, span_source=item)
238
+ )
239
+ text_res = "\n\n".join([r.text for r in res_parts])
210
240
 
211
- return SerializationResult(text=text_res)
241
+ return create_ser_result(text=text_res, span_source=res_parts)
212
242
 
213
243
  def _serialize_image_part(
214
244
  self,
@@ -255,7 +285,7 @@ class MarkdownPictureSerializer(BasePictureSerializer):
255
285
  else:
256
286
  text_res = image_placeholder
257
287
 
258
- return SerializationResult(text=text_res)
288
+ return create_ser_result(text=text_res, span_source=item)
259
289
 
260
290
 
261
291
  class MarkdownKeyValueSerializer(BaseKeyValueSerializer):
@@ -272,12 +302,13 @@ class MarkdownKeyValueSerializer(BaseKeyValueSerializer):
272
302
  ) -> SerializationResult:
273
303
  """Serializes the passed item."""
274
304
  # TODO add actual implementation
275
- text_res = (
276
- "<!-- missing-key-value-item -->"
277
- if item.self_ref not in doc_serializer.get_excluded_refs()
278
- else ""
279
- )
280
- return SerializationResult(text=text_res)
305
+ if item.self_ref not in doc_serializer.get_excluded_refs():
306
+ return create_ser_result(
307
+ text="<!-- missing-key-value-item -->",
308
+ span_source=item,
309
+ )
310
+ else:
311
+ return create_ser_result()
281
312
 
282
313
 
283
314
  class MarkdownFormSerializer(BaseFormSerializer):
@@ -294,12 +325,13 @@ class MarkdownFormSerializer(BaseFormSerializer):
294
325
  ) -> SerializationResult:
295
326
  """Serializes the passed item."""
296
327
  # TODO add actual implementation
297
- text_res = (
298
- "<!-- missing-form-item -->"
299
- if item.self_ref not in doc_serializer.get_excluded_refs()
300
- else ""
301
- )
302
- return SerializationResult(text=text_res)
328
+ if item.self_ref not in doc_serializer.get_excluded_refs():
329
+ return create_ser_result(
330
+ text="<!-- missing-form-item -->",
331
+ span_source=item,
332
+ )
333
+ else:
334
+ return create_ser_result()
303
335
 
304
336
 
305
337
  class MarkdownListSerializer(BaseModel, BaseListSerializer):
@@ -319,7 +351,7 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
319
351
  ) -> SerializationResult:
320
352
  """Serializes the passed item."""
321
353
  params = MarkdownParams(**kwargs)
322
- my_visited = visited or set()
354
+ my_visited = visited if visited is not None else set()
323
355
  parts = doc_serializer.get_parts(
324
356
  item=item,
325
357
  list_level=list_level + 1,
@@ -332,6 +364,7 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
332
364
  for p in parts:
333
365
  if p.text and p.text[0] == " " and my_parts:
334
366
  my_parts[-1].text = sep.join([my_parts[-1].text, p.text]) # update last
367
+ my_parts[-1].spans.extend(p.spans)
335
368
  else:
336
369
  my_parts.append(p)
337
370
 
@@ -343,12 +376,16 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
343
376
  (
344
377
  c.text
345
378
  if c.text and c.text[0] == " "
346
- else f"{indent_str}{f'{i + 1}.' if is_ol else '-'} {c.text}"
379
+ else (
380
+ f"{indent_str}"
381
+ f"{'' if isinstance(c, _PageBreakSerResult) else (f'{i + 1}. ' if is_ol else '- ')}" # noqa: E501
382
+ f"{c.text}"
383
+ )
347
384
  )
348
385
  for i, c in enumerate(my_parts)
349
386
  ]
350
387
  )
351
- return SerializationResult(text=text_res)
388
+ return create_ser_result(text=text_res, span_source=my_parts)
352
389
 
353
390
 
354
391
  class MarkdownInlineSerializer(BaseInlineSerializer):
@@ -366,15 +403,16 @@ class MarkdownInlineSerializer(BaseInlineSerializer):
366
403
  **kwargs,
367
404
  ) -> SerializationResult:
368
405
  """Serializes the passed item."""
369
- my_visited = visited or set()
406
+ my_visited = visited if visited is not None else set()
370
407
  parts = doc_serializer.get_parts(
371
408
  item=item,
372
409
  list_level=list_level,
373
410
  is_inline_scope=True,
374
411
  visited=my_visited,
412
+ **kwargs,
375
413
  )
376
414
  text_res = " ".join([p.text for p in parts if p.text])
377
- return SerializationResult(text=text_res)
415
+ return create_ser_result(text=text_res, span_source=parts)
378
416
 
379
417
 
380
418
  class MarkdownFallbackSerializer(BaseFallbackSerializer):
@@ -391,10 +429,12 @@ class MarkdownFallbackSerializer(BaseFallbackSerializer):
391
429
  ) -> SerializationResult:
392
430
  """Serializes the passed item."""
393
431
  if isinstance(item, DocItem):
394
- text_res = "<!-- missing-text -->"
432
+ return create_ser_result(
433
+ text="<!-- missing-text -->",
434
+ span_source=item,
435
+ )
395
436
  else:
396
- text_res = "" # TODO go with explicit None return type?
397
- return SerializationResult(text=text_res)
437
+ return create_ser_result()
398
438
 
399
439
 
400
440
  class MarkdownDocSerializer(DocSerializer):
@@ -472,7 +512,7 @@ class MarkdownDocSerializer(DocSerializer):
472
512
  params = self.params.merge_with_patch(patch=kwargs)
473
513
  if escape_underscores and params.escape_underscores:
474
514
  res = self._escape_underscores(text)
475
- if escape_html:
515
+ if escape_html and params.escape_html:
476
516
  res = html.escape(res, quote=False)
477
517
  res = super().post_process(
478
518
  text=res,
@@ -482,17 +522,19 @@ class MarkdownDocSerializer(DocSerializer):
482
522
  return res
483
523
 
484
524
  @override
485
- def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
486
- """Serialize a page out of its parts."""
487
- text_res = "\n\n".join([p.text for p in parts])
488
- return SerializationResult(text=text_res)
525
+ def serialize_doc(
526
+ self, *, parts: list[SerializationResult], **kwargs
527
+ ) -> SerializationResult:
528
+ """Serialize a document out of its parts."""
529
+ text_res = "\n\n".join([p.text for p in parts if p.text])
530
+ if self.params.page_break_placeholder:
531
+ page_sep = self.params.page_break_placeholder or ""
532
+ for full_match, _, _ in self._get_page_breaks(text=text_res):
533
+ text_res = text_res.replace(full_match, page_sep)
534
+
535
+ return create_ser_result(text=text_res, span_source=parts)
489
536
 
490
537
  @override
491
- def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
492
- """Serialize a document out of its pages."""
493
- if self.params.page_break_placeholder is not None:
494
- sep = f"\n\n{self.params.page_break_placeholder}\n\n"
495
- text_res = sep.join([p.text for p in pages if p.text])
496
- return SerializationResult(text=text_res)
497
- else:
498
- return self.serialize_page(parts=pages)
538
+ def requires_page_break(self):
539
+ """Whether to add page breaks."""
540
+ return self.params.page_break_placeholder is not None
@@ -9,6 +9,7 @@ from abc import ABC, abstractmethod
9
9
  from typing import Any, ClassVar, Iterator
10
10
 
11
11
  from pydantic import BaseModel
12
+ from typing_extensions import deprecated
12
13
 
13
14
  from docling_core.types.doc import DoclingDocument as DLDocument
14
15
 
@@ -65,8 +66,8 @@ class BaseChunker(BaseModel, ABC):
65
66
  """
66
67
  raise NotImplementedError()
67
68
 
68
- def serialize(self, chunk: BaseChunk) -> str:
69
- """Serialize the given chunk. This base implementation is embedding-targeted.
69
+ def contextualize(self, chunk: BaseChunk) -> str:
70
+ """Contextualize the given chunk. This implementation is embedding-targeted.
70
71
 
71
72
  Args:
72
73
  chunk: chunk to serialize
@@ -93,3 +94,8 @@ class BaseChunker(BaseModel, ABC):
93
94
  items.append(chunk.text)
94
95
 
95
96
  return self.delim.join(items)
97
+
98
+ @deprecated("Use contextualize() instead.")
99
+ def serialize(self, chunk: BaseChunk) -> str:
100
+ """Contextualize the given chunk. This implementation is embedding-targeted."""
101
+ return self.contextualize(chunk=chunk)