docling-core: 2.24.1 (py3-none-any.whl) → 2.26.0 (py3-none-any.whl)

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -0,0 +1,212 @@
1
+ """HTML styles for different export modes."""
2
+
3
+
4
+ def _get_css_with_no_styling() -> str:
5
+ """Return default CSS styles for the HTML document."""
6
+ return "<style></style>"
7
+
8
+
9
+ def _get_css_for_split_page() -> str:
10
+ """Return default CSS styles for the HTML document."""
11
+ return """<style>
12
+ html {
13
+ background-color: #e1e1e1;
14
+ font-family: Arial, sans-serif;
15
+ line-height: 1.6;
16
+ }
17
+ img {
18
+ min-width: 500px;
19
+ max-width: 100%;
20
+ }
21
+ table {
22
+ border-collapse: collapse;
23
+ border: 0px solid #fff;
24
+ width: 100%;
25
+ }
26
+ td {
27
+ vertical-align: top;
28
+ }
29
+ .page {
30
+ background-color: white;
31
+ margin-top:15px;
32
+ padding: 30px;
33
+ border: 1px solid black;
34
+ width:100%;
35
+ max-width:1000px;
36
+ box-shadow: 0 0 10px rgba(0,0,0,0.5);
37
+ }
38
+ .page figure {
39
+ text-align: center;
40
+ }
41
+ .page img {
42
+ max-width: 900px;
43
+ min-width: auto;
44
+ }
45
+ .page table {
46
+ border-collapse: collapse;
47
+ margin: 1em 0;
48
+ width: 100%;
49
+ }
50
+ .page table td {
51
+ border: 1px solid #ddd;
52
+ padding: 8px;
53
+ text-align: left;
54
+ }
55
+ .page table th {
56
+ border: 1px solid #ddd;
57
+ padding: 8px;
58
+ text-align: left;
59
+ background-color: #f2f2f2;
60
+ font-weight: bold;
61
+ }
62
+ .page table caption {
63
+ color: #666;
64
+ font-style: italic;
65
+ margin-top: 0.5em;
66
+ padding: 8px;
67
+ margin-top: 5px;
68
+ margin-bottom: 5px;
69
+ }
70
+ .page figcaption {
71
+ color: #666;
72
+ font-style: italic;
73
+ margin-top: 0.5em;
74
+ padding: 8px;
75
+ margin-top: 5px;
76
+ margin-bottom: 5px;
77
+ }
78
+ code {
79
+ background-color: rgb(228, 228, 228);
80
+ border: 1px solid darkgray;
81
+ padding: 10px;
82
+ display: inline-block;
83
+ font-family: monospace;
84
+ max-width:980px;
85
+ word-wrap: normal;
86
+ white-space: pre-wrap;
87
+ word-wrap: break-word;
88
+ /*overflow-wrap: break-word;*/
89
+ }
90
+ </style>
91
+ """
92
+
93
+
94
+ def _get_css_for_single_column() -> str:
95
+ """Return CSS styles for the single-column HTML document."""
96
+ return """<style>
97
+ html {
98
+ background-color: #f5f5f5;
99
+ font-family: Arial, sans-serif;
100
+ line-height: 1.6;
101
+ }
102
+ body {
103
+ max-width: 800px;
104
+ margin: 0 auto;
105
+ padding: 2rem;
106
+ background-color: white;
107
+ box-shadow: 0 0 10px rgba(0,0,0,0.1);
108
+ }
109
+ h1, h2, h3, h4, h5, h6 {
110
+ color: #333;
111
+ margin-top: 1.5em;
112
+ margin-bottom: 0.5em;
113
+ }
114
+ h1 {
115
+ font-size: 2em;
116
+ border-bottom: 1px solid #eee;
117
+ padding-bottom: 0.3em;
118
+ }
119
+ table {
120
+ border-collapse: collapse;
121
+ margin: 1em 0;
122
+ width: 100%;
123
+ }
124
+ th, td {
125
+ border: 1px solid #ddd;
126
+ padding: 8px;
127
+ text-align: left;
128
+ }
129
+ th {
130
+ background-color: #f2f2f2;
131
+ font-weight: bold;
132
+ }
133
+ figure {
134
+ margin: 1.5em 0;
135
+ text-align: center;
136
+ }
137
+ figcaption {
138
+ color: #666;
139
+ font-style: italic;
140
+ margin-top: 0.5em;
141
+ }
142
+ img {
143
+ max-width: 100%;
144
+ height: auto;
145
+ }
146
+ pre {
147
+ background-color: #f6f8fa;
148
+ border-radius: 3px;
149
+ padding: 1em;
150
+ overflow: auto;
151
+ }
152
+ code {
153
+ font-family: monospace;
154
+ background-color: #f6f8fa;
155
+ padding: 0.2em 0.4em;
156
+ border-radius: 3px;
157
+ }
158
+ pre code {
159
+ background-color: transparent;
160
+ padding: 0;
161
+ }
162
+ .formula {
163
+ text-align: center;
164
+ padding: 0.5em;
165
+ margin: 1em 0;
166
+ background-color: #f9f9f9;
167
+ }
168
+ .formula-not-decoded {
169
+ text-align: center;
170
+ padding: 0.5em;
171
+ margin: 1em 0;
172
+ background: repeating-linear-gradient(
173
+ 45deg,
174
+ #f0f0f0,
175
+ #f0f0f0 10px,
176
+ #f9f9f9 10px,
177
+ #f9f9f9 20px
178
+ );
179
+ }
180
+ .page-break {
181
+ page-break-after: always;
182
+ border-top: 1px dashed #ccc;
183
+ margin: 2em 0;
184
+ }
185
+ .key-value-region {
186
+ background-color: #f9f9f9;
187
+ padding: 1em;
188
+ border-radius: 4px;
189
+ margin: 1em 0;
190
+ }
191
+ .key-value-region dt {
192
+ font-weight: bold;
193
+ }
194
+ .key-value-region dd {
195
+ margin-left: 1em;
196
+ margin-bottom: 0.5em;
197
+ }
198
+ .form-container {
199
+ border: 1px solid #ddd;
200
+ padding: 1em;
201
+ border-radius: 4px;
202
+ margin: 1em 0;
203
+ }
204
+ .form-item {
205
+ margin-bottom: 0.5em;
206
+ }
207
+ .image-classification {
208
+ font-size: 0.9em;
209
+ color: #666;
210
+ margin-top: 0.5em;
211
+ }
212
+ </style>"""
@@ -26,7 +26,11 @@ from docling_core.experimental.serializer.base import (
26
26
  BaseTextSerializer,
27
27
  SerializationResult,
28
28
  )
29
- from docling_core.experimental.serializer.common import CommonParams, DocSerializer
29
+ from docling_core.experimental.serializer.common import (
30
+ CommonParams,
31
+ DocSerializer,
32
+ create_ser_result,
33
+ )
30
34
  from docling_core.types.doc.base import ImageRefMode
31
35
  from docling_core.types.doc.document import (
32
36
  CodeItem,
@@ -43,6 +47,7 @@ from docling_core.types.doc.document import (
43
47
  NodeItem,
44
48
  OrderedList,
45
49
  PictureItem,
50
+ PictureTabularChartData,
46
51
  SectionHeaderItem,
47
52
  TableItem,
48
53
  TextItem,
@@ -57,10 +62,12 @@ class MarkdownParams(CommonParams):
57
62
  layers: set[ContentLayer] = {ContentLayer.BODY}
58
63
  image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER
59
64
  image_placeholder: str = "<!-- image -->"
65
+ enable_chart_tables: bool = True
60
66
  indent: int = 4
61
67
  wrap_width: Optional[PositiveInt] = None
62
68
  page_break_placeholder: Optional[str] = None # e.g. "<!-- page break -->"
63
69
  escape_underscores: bool = True
70
+ escape_html: bool = True
64
71
 
65
72
 
66
73
  class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
@@ -78,46 +85,51 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
78
85
  ) -> SerializationResult:
79
86
  """Serializes the passed item."""
80
87
  params = MarkdownParams(**kwargs)
81
- parts: list[str] = []
88
+ res_parts: list[SerializationResult] = []
82
89
  escape_html = True
83
90
  escape_underscores = True
84
91
  if isinstance(item, TitleItem):
85
- text = f"# {item.text}"
92
+ text_part = f"# {item.text}"
86
93
  elif isinstance(item, SectionHeaderItem):
87
- text = f"{(item.level + 1) * '#'} {item.text}"
94
+ text_part = f"{(item.level + 1) * '#'} {item.text}"
88
95
  elif isinstance(item, CodeItem):
89
- text = f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
96
+ text_part = (
97
+ f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
98
+ )
90
99
  escape_html = False
91
100
  escape_underscores = False
92
101
  elif isinstance(item, FormulaItem):
93
102
  if item.text:
94
- text = f"${item.text}$" if is_inline_scope else f"$${item.text}$$"
103
+ text_part = f"${item.text}$" if is_inline_scope else f"$${item.text}$$"
95
104
  elif item.orig:
96
- text = "<!-- formula-not-decoded -->"
105
+ text_part = "<!-- formula-not-decoded -->"
97
106
  else:
98
- text = ""
107
+ text_part = ""
99
108
  escape_html = False
100
109
  escape_underscores = False
101
110
  elif params.wrap_width:
102
- text = textwrap.fill(item.text, width=params.wrap_width)
111
+ text_part = textwrap.fill(item.text, width=params.wrap_width)
103
112
  else:
104
- text = item.text
105
- parts.append(text)
113
+ text_part = item.text
114
+
115
+ if text_part:
116
+ text_res = create_ser_result(text=text_part, span_source=item)
117
+ res_parts.append(text_res)
106
118
 
107
119
  if isinstance(item, FloatingItem):
108
- cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
109
- if cap_text:
110
- parts.append(cap_text)
120
+ cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
121
+ if cap_res.text:
122
+ res_parts.append(cap_res)
111
123
 
112
- text_res = (" " if is_inline_scope else "\n\n").join(parts)
113
- text_res = doc_serializer.post_process(
114
- text=text_res,
124
+ text = (" " if is_inline_scope else "\n\n").join([r.text for r in res_parts])
125
+ text = doc_serializer.post_process(
126
+ text=text,
115
127
  escape_html=escape_html,
116
128
  escape_underscores=escape_underscores,
117
129
  formatting=item.formatting,
118
130
  hyperlink=item.hyperlink,
119
131
  )
120
- return SerializationResult(text=text_res)
132
+ return create_ser_result(text=text, span_source=res_parts)
121
133
 
122
134
 
123
135
  class MarkdownTableSerializer(BaseTableSerializer):
@@ -133,14 +145,14 @@ class MarkdownTableSerializer(BaseTableSerializer):
133
145
  **kwargs,
134
146
  ) -> SerializationResult:
135
147
  """Serializes the passed item."""
136
- text_parts: list[str] = []
148
+ res_parts: list[SerializationResult] = []
137
149
 
138
150
  cap_res = doc_serializer.serialize_captions(
139
151
  item=item,
140
152
  **kwargs,
141
153
  )
142
154
  if cap_res.text:
143
- text_parts.append(cap_res.text)
155
+ res_parts.append(cap_res)
144
156
 
145
157
  if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
146
158
  rows = [
@@ -165,11 +177,11 @@ class MarkdownTableSerializer(BaseTableSerializer):
165
177
  else:
166
178
  table_text = ""
167
179
  if table_text:
168
- text_parts.append(table_text)
180
+ res_parts.append(create_ser_result(text=table_text, span_source=item))
169
181
 
170
- text_res = "\n\n".join(text_parts)
182
+ text_res = "\n\n".join([r.text for r in res_parts])
171
183
 
172
- return SerializationResult(text=text_res)
184
+ return create_ser_result(text=text_res, span_source=res_parts)
173
185
 
174
186
 
175
187
  class MarkdownPictureSerializer(BasePictureSerializer):
@@ -187,14 +199,14 @@ class MarkdownPictureSerializer(BasePictureSerializer):
187
199
  """Serializes the passed item."""
188
200
  params = MarkdownParams(**kwargs)
189
201
 
190
- texts: list[str] = []
202
+ res_parts: list[SerializationResult] = []
191
203
 
192
204
  cap_res = doc_serializer.serialize_captions(
193
205
  item=item,
194
206
  **kwargs,
195
207
  )
196
208
  if cap_res.text:
197
- texts.append(cap_res.text)
209
+ res_parts.append(cap_res)
198
210
 
199
211
  if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
200
212
  img_res = self._serialize_image_part(
@@ -204,11 +216,28 @@ class MarkdownPictureSerializer(BasePictureSerializer):
204
216
  image_placeholder=params.image_placeholder,
205
217
  )
206
218
  if img_res.text:
207
- texts.append(img_res.text)
208
-
209
- text_res = "\n\n".join(texts)
219
+ res_parts.append(img_res)
220
+
221
+ if params.enable_chart_tables:
222
+ # Check if picture has attached PictureTabularChartData
223
+ tabular_chart_annotations = [
224
+ ann
225
+ for ann in item.annotations
226
+ if isinstance(ann, PictureTabularChartData)
227
+ ]
228
+ if len(tabular_chart_annotations) > 0:
229
+ temp_doc = DoclingDocument(name="temp")
230
+ temp_table = temp_doc.add_table(
231
+ data=tabular_chart_annotations[0].chart_data
232
+ )
233
+ md_table_content = temp_table.export_to_markdown(temp_doc)
234
+ if len(md_table_content) > 0:
235
+ res_parts.append(
236
+ create_ser_result(text=md_table_content, span_source=item)
237
+ )
238
+ text_res = "\n\n".join([r.text for r in res_parts])
210
239
 
211
- return SerializationResult(text=text_res)
240
+ return create_ser_result(text=text_res, span_source=res_parts)
212
241
 
213
242
  def _serialize_image_part(
214
243
  self,
@@ -255,7 +284,7 @@ class MarkdownPictureSerializer(BasePictureSerializer):
255
284
  else:
256
285
  text_res = image_placeholder
257
286
 
258
- return SerializationResult(text=text_res)
287
+ return create_ser_result(text=text_res, span_source=item)
259
288
 
260
289
 
261
290
  class MarkdownKeyValueSerializer(BaseKeyValueSerializer):
@@ -272,12 +301,13 @@ class MarkdownKeyValueSerializer(BaseKeyValueSerializer):
272
301
  ) -> SerializationResult:
273
302
  """Serializes the passed item."""
274
303
  # TODO add actual implementation
275
- text_res = (
276
- "<!-- missing-key-value-item -->"
277
- if item.self_ref not in doc_serializer.get_excluded_refs()
278
- else ""
279
- )
280
- return SerializationResult(text=text_res)
304
+ if item.self_ref not in doc_serializer.get_excluded_refs():
305
+ return create_ser_result(
306
+ text="<!-- missing-key-value-item -->",
307
+ span_source=item,
308
+ )
309
+ else:
310
+ return create_ser_result()
281
311
 
282
312
 
283
313
  class MarkdownFormSerializer(BaseFormSerializer):
@@ -294,12 +324,13 @@ class MarkdownFormSerializer(BaseFormSerializer):
294
324
  ) -> SerializationResult:
295
325
  """Serializes the passed item."""
296
326
  # TODO add actual implementation
297
- text_res = (
298
- "<!-- missing-form-item -->"
299
- if item.self_ref not in doc_serializer.get_excluded_refs()
300
- else ""
301
- )
302
- return SerializationResult(text=text_res)
327
+ if item.self_ref not in doc_serializer.get_excluded_refs():
328
+ return create_ser_result(
329
+ text="<!-- missing-form-item -->",
330
+ span_source=item,
331
+ )
332
+ else:
333
+ return create_ser_result()
303
334
 
304
335
 
305
336
  class MarkdownListSerializer(BaseModel, BaseListSerializer):
@@ -319,7 +350,7 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
319
350
  ) -> SerializationResult:
320
351
  """Serializes the passed item."""
321
352
  params = MarkdownParams(**kwargs)
322
- my_visited = visited or set()
353
+ my_visited = visited if visited is not None else set()
323
354
  parts = doc_serializer.get_parts(
324
355
  item=item,
325
356
  list_level=list_level + 1,
@@ -332,6 +363,7 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
332
363
  for p in parts:
333
364
  if p.text and p.text[0] == " " and my_parts:
334
365
  my_parts[-1].text = sep.join([my_parts[-1].text, p.text]) # update last
366
+ my_parts[-1].spans.extend(p.spans)
335
367
  else:
336
368
  my_parts.append(p)
337
369
 
@@ -348,7 +380,7 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
348
380
  for i, c in enumerate(my_parts)
349
381
  ]
350
382
  )
351
- return SerializationResult(text=text_res)
383
+ return create_ser_result(text=text_res, span_source=my_parts)
352
384
 
353
385
 
354
386
  class MarkdownInlineSerializer(BaseInlineSerializer):
@@ -366,7 +398,7 @@ class MarkdownInlineSerializer(BaseInlineSerializer):
366
398
  **kwargs,
367
399
  ) -> SerializationResult:
368
400
  """Serializes the passed item."""
369
- my_visited = visited or set()
401
+ my_visited = visited if visited is not None else set()
370
402
  parts = doc_serializer.get_parts(
371
403
  item=item,
372
404
  list_level=list_level,
@@ -374,7 +406,7 @@ class MarkdownInlineSerializer(BaseInlineSerializer):
374
406
  visited=my_visited,
375
407
  )
376
408
  text_res = " ".join([p.text for p in parts if p.text])
377
- return SerializationResult(text=text_res)
409
+ return create_ser_result(text=text_res, span_source=parts)
378
410
 
379
411
 
380
412
  class MarkdownFallbackSerializer(BaseFallbackSerializer):
@@ -391,10 +423,12 @@ class MarkdownFallbackSerializer(BaseFallbackSerializer):
391
423
  ) -> SerializationResult:
392
424
  """Serializes the passed item."""
393
425
  if isinstance(item, DocItem):
394
- text_res = "<!-- missing-text -->"
426
+ return create_ser_result(
427
+ text="<!-- missing-text -->",
428
+ span_source=item,
429
+ )
395
430
  else:
396
- text_res = "" # TODO go with explicit None return type?
397
- return SerializationResult(text=text_res)
431
+ return create_ser_result()
398
432
 
399
433
 
400
434
  class MarkdownDocSerializer(DocSerializer):
@@ -472,7 +506,7 @@ class MarkdownDocSerializer(DocSerializer):
472
506
  params = self.params.merge_with_patch(patch=kwargs)
473
507
  if escape_underscores and params.escape_underscores:
474
508
  res = self._escape_underscores(text)
475
- if escape_html:
509
+ if escape_html and params.escape_html:
476
510
  res = html.escape(res, quote=False)
477
511
  res = super().post_process(
478
512
  text=res,
@@ -482,17 +516,21 @@ class MarkdownDocSerializer(DocSerializer):
482
516
  return res
483
517
 
484
518
  @override
485
- def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
519
+ def serialize_page(
520
+ self, *, parts: list[SerializationResult], **kwargs
521
+ ) -> SerializationResult:
486
522
  """Serialize a page out of its parts."""
487
- text_res = "\n\n".join([p.text for p in parts])
488
- return SerializationResult(text=text_res)
523
+ text_res = "\n\n".join([p.text for p in parts if p.text])
524
+ return create_ser_result(text=text_res, span_source=parts)
489
525
 
490
526
  @override
491
- def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
527
+ def serialize_doc(
528
+ self, *, pages: dict[Optional[int], SerializationResult], **kwargs
529
+ ) -> SerializationResult:
492
530
  """Serialize a document out of its pages."""
493
531
  if self.params.page_break_placeholder is not None:
494
532
  sep = f"\n\n{self.params.page_break_placeholder}\n\n"
495
- text_res = sep.join([p.text for p in pages if p.text])
496
- return SerializationResult(text=text_res)
533
+ text_res = sep.join([text for k in pages if (text := pages[k].text)])
534
+ return create_ser_result(text=text_res, span_source=list(pages.values()))
497
535
  else:
498
- return self.serialize_page(parts=pages)
536
+ return self.serialize_page(parts=list(pages.values()))
@@ -9,6 +9,7 @@ from abc import ABC, abstractmethod
9
9
  from typing import Any, ClassVar, Iterator
10
10
 
11
11
  from pydantic import BaseModel
12
+ from typing_extensions import deprecated
12
13
 
13
14
  from docling_core.types.doc import DoclingDocument as DLDocument
14
15
 
@@ -65,8 +66,8 @@ class BaseChunker(BaseModel, ABC):
65
66
  """
66
67
  raise NotImplementedError()
67
68
 
68
- def serialize(self, chunk: BaseChunk) -> str:
69
- """Serialize the given chunk. This base implementation is embedding-targeted.
69
+ def contextualize(self, chunk: BaseChunk) -> str:
70
+ """Contextualize the given chunk. This implementation is embedding-targeted.
70
71
 
71
72
  Args:
72
73
  chunk: chunk to serialize
@@ -93,3 +94,8 @@ class BaseChunker(BaseModel, ABC):
93
94
  items.append(chunk.text)
94
95
 
95
96
  return self.delim.join(items)
97
+
98
+ @deprecated("Use contextualize() instead.")
99
+ def serialize(self, chunk: BaseChunk) -> str:
100
+ """Contextualize the given chunk. This implementation is embedding-targeted."""
101
+ return self.contextualize(chunk=chunk)