docling-core 2.25.0__py3-none-any.whl → 2.26.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -0,0 +1,941 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2025
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define classes for HTML serialization."""
7
+ import base64
8
+ import html
9
+ import logging
10
+ from enum import Enum
11
+ from io import BytesIO
12
+ from pathlib import Path
13
+ from typing import Optional, Union
14
+ from urllib.parse import quote
15
+ from xml.etree.cElementTree import SubElement, tostring
16
+ from xml.sax.saxutils import unescape
17
+
18
+ import latex2mathml.converter
19
+ from pydantic import AnyUrl, BaseModel
20
+ from typing_extensions import override
21
+
22
+ from docling_core.experimental.serializer.base import (
23
+ BaseDocSerializer,
24
+ BaseFallbackSerializer,
25
+ BaseFormSerializer,
26
+ BaseInlineSerializer,
27
+ BaseKeyValueSerializer,
28
+ BaseListSerializer,
29
+ BasePictureSerializer,
30
+ BaseTableSerializer,
31
+ BaseTextSerializer,
32
+ SerializationResult,
33
+ )
34
+ from docling_core.experimental.serializer.common import (
35
+ CommonParams,
36
+ DocSerializer,
37
+ create_ser_result,
38
+ )
39
+ from docling_core.experimental.serializer.html_styles import (
40
+ _get_css_for_single_column,
41
+ _get_css_for_split_page,
42
+ )
43
+ from docling_core.types.doc.base import ImageRefMode
44
+ from docling_core.types.doc.document import (
45
+ CodeItem,
46
+ ContentLayer,
47
+ DocItem,
48
+ DoclingDocument,
49
+ FloatingItem,
50
+ FormItem,
51
+ FormulaItem,
52
+ GraphData,
53
+ ImageRef,
54
+ InlineGroup,
55
+ KeyValueItem,
56
+ ListItem,
57
+ NodeItem,
58
+ OrderedList,
59
+ PictureItem,
60
+ SectionHeaderItem,
61
+ TableCell,
62
+ TableItem,
63
+ TextItem,
64
+ TitleItem,
65
+ UnorderedList,
66
+ )
67
+ from docling_core.types.doc.labels import DocItemLabel
68
+ from docling_core.types.doc.utils import (
69
+ get_html_tag_with_text_direction,
70
+ get_text_direction,
71
+ )
72
+
73
+ _logger = logging.getLogger(__name__)
74
+
75
+
76
class HTMLOutputStyle(str, Enum):
    """Page layouts the HTML document serializer can produce."""

    # The whole document content is rendered in a single column.
    SINGLE_COLUMN = "single_column"
    # One table row per page: the page image next to that page's content.
    SPLIT_PAGE = "split_page"
81
+
82
+
83
class HTMLParams(CommonParams):
    """Serialization parameters that only apply to HTML output."""

    # Content layers included in the export (body only by default).
    layers: set[ContentLayer] = {ContentLayer.BODY}

    # Image handling mode (placeholder, embedded or referenced).
    image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER

    # Document-level HTML properties.
    html_lang: str = "en"
    # When set, returned verbatim in place of the generated <head> section.
    html_head: Optional[str] = None

    # Custom CSS, given with or without a surrounding <style> element.
    css_styles: Optional[str] = None

    # Emit <title> and generator <meta> tags in the generated head.
    add_document_metadata: bool = True
    # Join the generated head parts with line breaks.
    prettify: bool = True

    # Convert formulas to MathML where possible.
    formula_to_mathml: bool = True

    # Page layout of the produced document.
    output_style: HTMLOutputStyle = HTMLOutputStyle.SINGLE_COLUMN
106
+
107
+
108
class HTMLTextSerializer(BaseModel, BaseTextSerializer):
    """HTML-specific text item serializer."""

    @override
    def serialize(
        self,
        *,
        item: TextItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        is_inline_scope: bool = False,
        **kwargs,
    ) -> SerializationResult:
        """Serialize the passed text item to HTML.

        Args:
            item: The text item to render. Titles, section headers, formulas,
                code and list items get dedicated markup; any other item
                becomes a paragraph, or bare text inside an inline scope.
            doc_serializer: Document serializer used for post-processing
                (formatting, hyperlinks) and for caption serialization.
            doc: The document the item belongs to.
            is_inline_scope: Whether the item is rendered inside an inline
                group.
            **kwargs: Values used to build the ``HTMLParams``; also forwarded
                to the caption serialization.

        Returns:
            The serialization result carrying the item's HTML.
        """
        params = HTMLParams(**kwargs)
        res_parts: list[SerializationResult] = []

        # Prepare the HTML based on item type
        if isinstance(item, TitleItem):
            text_inner = self._prepare_content(item.text)
            text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)

        elif isinstance(item, SectionHeaderItem):
            # <h1> is used for the title, so section levels are shifted by
            # one and capped at <h6>.
            section_level = min(item.level + 1, 6)
            text_inner = self._prepare_content(item.text)
            text = get_html_tag_with_text_direction(
                html_tag=f"h{section_level}", text=text_inner
            )

        elif isinstance(item, FormulaItem):
            text = self._process_formula(
                item=item,
                doc=doc,
                image_mode=params.image_mode,
                formula_to_mathml=params.formula_to_mathml,
                is_inline_scope=is_inline_scope,
            )

        elif isinstance(item, CodeItem):
            text = self._process_code(item=item, is_inline_scope=is_inline_scope)

        elif isinstance(item, ListItem):
            # The surrounding <ol>/<ul> is produced by the list serializer
            text_inner = self._prepare_content(item.text)
            text = get_html_tag_with_text_direction(html_tag="li", text=text_inner)

        elif is_inline_scope:
            text = self._prepare_content(item.text)
        else:
            # Regular text item
            text_inner = self._prepare_content(item.text)
            text = get_html_tag_with_text_direction(html_tag="p", text=text_inner)

        # Apply formatting and hyperlinks
        text = doc_serializer.post_process(
            text=text,
            formatting=item.formatting,
            hyperlink=item.hyperlink,
        )

        if text:
            text_res = create_ser_result(text=text, span_source=item)
            res_parts.append(text_res)

        if isinstance(item, FloatingItem):
            cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
            if cap_res.text:
                res_parts.append(cap_res)

        # NOTE(review): captions only contribute to the span sources here; the
        # returned text does not contain the caption markup - confirm intended.
        return create_ser_result(text=text, span_source=res_parts)

    def _prepare_content(
        self,
        text: str,
        do_escape_html: bool = True,
        do_replace_newline: bool = True,
    ) -> str:
        """Prepare text content for HTML inclusion.

        Args:
            text: Raw text.
            do_escape_html: Escape ``&``, ``<`` and ``>`` (quotes are kept).
            do_replace_newline: Replace line breaks with ``<br>``.

        Returns:
            The processed text.
        """
        if do_escape_html:
            text = html.escape(text, quote=False)
        if do_replace_newline:
            text = text.replace("\n", "<br>")
        return text

    def _process_code(
        self,
        item: CodeItem,
        is_inline_scope: bool,
    ) -> str:
        """Render a code item as ``<code>`` (inline) or ``<pre><code>``."""
        # Line breaks are kept verbatim: they are significant in code.
        code_text = self._prepare_content(
            item.text, do_escape_html=True, do_replace_newline=False
        )
        if is_inline_scope:
            return f"<code>{code_text}</code>"
        return f"<pre><code>{code_text}</code></pre>"

    def _process_formula(
        self,
        item: FormulaItem,
        doc: DoclingDocument,
        image_mode: ImageRefMode,
        formula_to_mathml: bool,
        is_inline_scope: bool,
    ) -> str:
        """Process a formula item to HTML/MathML.

        Tried in order: an embedded image when the formula text is empty,
        MathML conversion, and finally a plain-text rendering of the formula
        (or a "not decoded" marker when there is no formula text).

        Args:
            item: The formula item.
            doc: The document the item belongs to (used for image crops).
            image_mode: Image fallbacks are only used for ``EMBEDDED``.
            formula_to_mathml: Whether to attempt a MathML conversion.
            is_inline_scope: Selects inline instead of block-level markup.

        Returns:
            The HTML for the formula.
        """
        # The converter needs the raw, unescaped formula text.
        math_formula = self._prepare_content(
            item.text, do_escape_html=False, do_replace_newline=False
        )
        # Escaped variant for plain-text fallbacks, as done for code items.
        escaped_formula = self._prepare_content(
            item.text, do_escape_html=True, do_replace_newline=False
        )

        # If formula is empty, try to use an image fallback
        if item.text == "" and item.orig != "":
            img_fallback = self._embedded_image_fallback(item, doc, image_mode)
            if img_fallback:
                return img_fallback

        # Try to generate MathML
        if formula_to_mathml and math_formula:
            try:
                # Set display mode based on context
                display_mode = "inline" if is_inline_scope else "block"
                mathml_element = latex2mathml.converter.convert_to_element(
                    math_formula, display=display_mode
                )
                annotation = SubElement(
                    mathml_element, "annotation", dict(encoding="TeX")
                )
                annotation.text = math_formula
                mathml = unescape(tostring(mathml_element, encoding="unicode"))
            except Exception:
                # Conversion is best effort: report it and degrade gracefully.
                _logger.warning("Could not parse formula with MathML")
                img_fallback = self._embedded_image_fallback(item, doc, image_mode)
                if img_fallback:
                    return img_fallback
                return f"<pre>{escaped_formula}</pre>"

            # Don't wrap in div for inline formulas
            return mathml if is_inline_scope else f"<div>{mathml}</div>"

        # Fallback options if we got here
        if math_formula:
            if is_inline_scope:
                return f"<code>{escaped_formula}</code>"
            return f"<pre>{escaped_formula}</pre>"
        if is_inline_scope:
            return '<span class="formula-not-decoded">Formula not decoded</span>'
        return '<div class="formula-not-decoded">Formula not decoded</div>'

    def _embedded_image_fallback(
        self, item: TextItem, doc: DoclingDocument, image_mode: ImageRefMode
    ) -> Optional[str]:
        """Return the image fallback when the image mode and item allow it."""
        # Check the cheap conditions first so that no image is computed when
        # it could not be used anyway.
        if image_mode != ImageRefMode.EMBEDDED or len(item.prov) == 0:
            return None
        return self._get_formula_image_fallback(item, doc)

    def _get_formula_image_fallback(
        self, item: TextItem, doc: DoclingDocument
    ) -> Optional[str]:
        """Try to get an image fallback for a formula.

        Returns:
            A ``<figure>`` embedding the item's image, or ``None`` when the
            item has no image.
        """
        item_image = item.get_image(doc=doc)
        if item_image is None:
            return None
        img_ref = ImageRef.from_pil(item_image, dpi=72)
        # The original text ends up in an attribute value: escape quotes too.
        alt_text = html.escape(item.orig, quote=True)
        return f'<figure><img src="{img_ref.uri}" alt="{alt_text}" /></figure>'
283
+
284
+
285
class HTMLTableSerializer(BaseTableSerializer):
    """HTML-specific table item serializer."""

    @override
    def serialize(
        self,
        *,
        item: TableItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize the passed table item to HTML.

        Args:
            item: The table item to render.
            doc_serializer: Document serializer providing the captions and the
                set of excluded references.
            doc: The document the item belongs to.
            **kwargs: Forwarded to the document serializer calls.

        Returns:
            A result whose text is a ``<table>`` element holding the caption
            (if any) and the table body (unless the table is excluded), or an
            empty string when neither produced any markup.
        """
        nrows = item.data.num_rows
        ncols = item.data.num_cols

        res_parts: list[SerializationResult] = []
        cap_res = doc_serializer.serialize_captions(item=item, tag="caption", **kwargs)
        if cap_res.text:
            res_parts.append(cap_res)

        if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
            body_parts: list[str] = []

            for i in range(nrows):
                body_parts.append("<tr>")
                for j in range(ncols):
                    cell: TableCell = item.data.grid[i][j]

                    # A spanning cell fills several grid slots; emit it only
                    # at the slot where it starts.
                    if cell.start_row_offset_idx != i:
                        continue
                    if cell.start_col_offset_idx != j:
                        continue

                    content = html.escape(cell.text.strip())
                    celltag = "th" if cell.column_header else "td"

                    opening_tag = celltag
                    if cell.row_span > 1:
                        opening_tag += f' rowspan="{cell.row_span}"'
                    if cell.col_span > 1:
                        opening_tag += f' colspan="{cell.col_span}"'

                    text_dir = get_text_direction(content)
                    if text_dir == "rtl":
                        # Use the detected direction, not the ``dir`` builtin.
                        opening_tag += f' dir="{text_dir}"'

                    body_parts.append(f"<{opening_tag}>{content}</{celltag}>")
                body_parts.append("</tr>")

            body = "".join(body_parts)
            if body:
                body = f"<tbody>{body}</tbody>"
                res_parts.append(create_ser_result(text=body, span_source=item))

        text_res = "".join([r.text for r in res_parts])
        text_res = f"<table>{text_res}</table>" if text_res else ""

        return create_ser_result(text=text_res, span_source=res_parts)
354
+
355
+
356
class HTMLPictureSerializer(BasePictureSerializer):
    """Picture item serializer producing HTML ``<figure>`` markup."""

    @override
    def serialize(
        self,
        *,
        item: PictureItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Render a picture, together with its captions, as HTML.

        The caption is rendered as ``<figcaption>``. An ``<img>`` tag is only
        added for the embedded and referenced image modes, and only when the
        picture is not among the excluded references.
        """
        params = HTMLParams(**kwargs)
        collected: list[SerializationResult] = []

        caption = doc_serializer.serialize_captions(
            item=item,
            tag="figcaption",
            **kwargs,
        )
        if caption.text:
            collected.append(caption)

        img_tag = ""
        if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
            image = item.image
            has_image_ref = isinstance(image, ImageRef)
            is_data_uri = (
                has_image_ref
                and isinstance(image.uri, AnyUrl)
                and image.uri.scheme == "data"
            )
            mode = params.image_mode

            if mode == ImageRefMode.EMBEDDED:
                if is_data_uri:
                    # The image already comes as a data URI: reuse it as is
                    img_tag = f'<img src="{image.uri}">'
                else:
                    # Otherwise obtain the image and encode it here
                    pil_image = item.get_image(doc)
                    if pil_image is not None:
                        encoded = item._image_to_base64(pil_image)
                        img_tag = f'<img src="data:image/png;base64,{encoded}">'
            elif mode == ImageRefMode.REFERENCED:
                if has_image_ref and not is_data_uri:
                    img_tag = f'<img src="{quote(str(image.uri))}">'

        if img_tag:
            collected.append(create_ser_result(text=img_tag, span_source=item))

        figure_html = "".join([part.text for part in collected])
        if figure_html:
            figure_html = f"<figure>{figure_html}</figure>"

        return create_ser_result(text=figure_html, span_source=collected)
413
+
414
+
415
class _HTMLGraphDataSerializer:
    """Renders the graph data of form and key-value items as HTML."""

    def serialize(
        self,
        *,
        item: Union[FormItem, KeyValueItem],
        graph_data: GraphData,
        class_name: str,
    ) -> SerializationResult:
        """Turn the cells and links of ``graph_data`` into HTML.

        Cells that are not the target of a ``to_child`` link are rendered as
        the roots of a nested ``<ul>``. If there is no such cell, the
        ``to_value`` links are rendered as a ``<dl>`` instead. Everything is
        wrapped in a ``<div>`` carrying ``class_name``.
        """
        cells_by_id = {cell.cell_id: cell for cell in graph_data.cells}

        children_of: dict[int, list[int]] = {}  # source id -> child ids
        values_of: dict[int, list[int]] = {}  # key id -> value ids
        child_targets: set[int] = set()  # ids having a parent

        for link in graph_data.links:
            src, dst = link.source_cell_id, link.target_cell_id
            # Links pointing to unknown cells are ignored
            if not (src in cells_by_id and dst in cells_by_id):
                continue

            kind = link.label.value
            if kind == "to_child":
                children_of.setdefault(src, []).append(dst)
                child_targets.add(dst)
            elif kind == "to_value":
                values_of.setdefault(src, []).append(dst)

        roots = [cid for cid in cells_by_id if cid not in child_targets]

        out = [f'<div class="{class_name}">']
        if roots:
            # Hierarchical rendering starting from every root cell
            out.append(f'<ul class="{class_name}">')
            out.extend(
                self._render_cell_tree(
                    cell_id=root,
                    cell_map=cells_by_id,
                    child_links=children_of,
                    value_links=values_of,
                    level=0,
                )
                for root in roots
            )
            out.append("</ul>")
        else:
            # No root cell: flat definition list of keys and their values
            out.append(f'<dl class="{class_name}">')
            for key_id, value_ids in values_of.items():
                out.append(f"<dt>{html.escape(cells_by_id[key_id].text)}</dt>")
                out.extend(
                    f"<dd>{html.escape(cells_by_id[value_id].text)}</dd>"
                    for value_id in value_ids
                )
            out.append("</dl>")
        out.append("</div>")

        return create_ser_result(text="\n".join(out), span_source=item)

    def _render_cell_tree(
        self,
        cell_id: int,
        cell_map: dict,
        child_links: dict,
        value_links: dict,
        level: int,
    ) -> str:
        """Render the cell ``cell_id`` and, recursively, its children.

        Returns an empty string for a cell that has neither children nor
        linked values.
        """
        label = html.escape(cell_map[cell_id].text)
        has_values = cell_id in value_links

        if has_values:
            # Key cell: show it in bold, followed by its known values
            joined_values = ", ".join(
                html.escape(cell_map[value_id].text)
                for value_id in value_links[cell_id]
                if value_id in cell_map
            )
            label = f"<strong>{label}</strong>: {joined_values}"

        children = child_links.get(cell_id)
        if children:
            lines = [f"<li>{label}</li>", "<ul>"]
            lines.extend(
                self._render_cell_tree(
                    cell_id=child,
                    cell_map=cell_map,
                    child_links=child_links,
                    value_links=value_links,
                    level=level + 1,
                )
                for child in children
            )
            lines.append("</ul>")
            return "\n".join(lines)

        if has_values:
            return f"<li>{label}</li>"

        # Cell without children and without values: nothing is rendered
        return ""
542
+
543
+
544
class HTMLKeyValueSerializer(BaseKeyValueSerializer):
    """Key-value item serializer producing HTML."""

    @override
    def serialize(
        self,
        *,
        item: KeyValueItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Render a key-value item: its graph first, then its captions."""
        collected: list[SerializationResult] = []

        excluded_refs = doc_serializer.get_excluded_refs(**kwargs)
        if item.self_ref not in excluded_refs:
            # Graph markup, only kept when it is non-empty
            graph_res = _HTMLGraphDataSerializer().serialize(
                item=item,
                graph_data=item.graph,
                class_name="key-value-region",
            )
            if graph_res.text:
                collected.append(graph_res)

        # Captions, only kept when non-empty
        caption_res = doc_serializer.serialize_captions(item=item, **kwargs)
        if caption_res.text:
            collected.append(caption_res)

        joined = "\n".join(part.text for part in collected)
        return create_ser_result(text=joined, span_source=collected)
579
+
580
+
581
class HTMLFormSerializer(BaseFormSerializer):
    """Form item serializer producing HTML."""

    @override
    def serialize(
        self,
        *,
        item: FormItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Render a form item: its graph first, then its captions."""
        collected: list[SerializationResult] = []

        excluded_refs = doc_serializer.get_excluded_refs(**kwargs)
        if item.self_ref not in excluded_refs:
            # Graph markup, only kept when it is non-empty
            graph_res = _HTMLGraphDataSerializer().serialize(
                item=item,
                graph_data=item.graph,
                class_name="form-container",
            )
            if graph_res.text:
                collected.append(graph_res)

        # Captions, only kept when non-empty
        caption_res = doc_serializer.serialize_captions(item=item, **kwargs)
        if caption_res.text:
            collected.append(caption_res)

        joined = "\n".join(part.text for part in collected)
        return create_ser_result(text=joined, span_source=collected)
616
+
617
+
618
class HTMLListSerializer(BaseModel, BaseListSerializer):
    """List serializer producing HTML ``<ol>``/``<ul>`` markup."""

    @override
    def serialize(
        self,
        *,
        item: Union[UnorderedList, OrderedList],
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        list_level: int = 0,
        is_inline_scope: bool = False,
        visited: Optional[set[str]] = None,  # refs of visited items
        **kwargs,
    ) -> SerializationResult:
        """Render an ordered or unordered list and its children as HTML.

        Children whose markup is not already a complete ``<li>``, ``<ol>`` or
        ``<ul>`` element are wrapped in ``<li>``. An empty list produces an
        empty text.
        """
        seen: set[str] = set() if visited is None else visited

        children = doc_serializer.get_parts(
            item=item,
            list_level=list_level + 1,
            is_inline_scope=is_inline_scope,
            visited=seen,
            **kwargs,
        )

        # Opening/closing pairs marking markup that needs no extra <li>
        wrappers = (("<li>", "</li>"), ("<ol>", "</ol>"), ("<ul>", "</ul>"))
        rendered: list[str] = []
        for child in children:
            fragment = child.text
            already_wrapped = any(
                fragment.startswith(opening) and fragment.endswith(closing)
                for opening, closing in wrappers
            )
            rendered.append(fragment if already_wrapped else f"<li>{fragment}</li>")

        text_res = "\n".join(rendered)
        if text_res:
            tag = "ol" if isinstance(item, OrderedList) else "ul"
            text_res = f"<{tag}>\n{text_res}\n</{tag}>"

        return create_ser_result(text=text_res, span_source=children)
665
+
666
+
667
class HTMLInlineSerializer(BaseInlineSerializer):
    """Inline group serializer producing HTML."""

    @override
    def serialize(
        self,
        *,
        item: InlineGroup,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        list_level: int = 0,
        visited: Optional[set[str]] = None,  # refs of visited items
        **kwargs,
    ) -> SerializationResult:
        """Render the members of an inline group inside a single ``<span>``."""
        seen: set[str] = set() if visited is None else visited

        # Children are serialized in inline scope
        children = doc_serializer.get_parts(
            item=item,
            list_level=list_level,
            is_inline_scope=True,
            visited=seen,
            **kwargs,
        )

        # Non-empty fragments, separated by a single space
        fragments = [child.text for child in children if child.text]
        inline_html = " ".join(fragments)

        # No wrapper is emitted when there is no content
        if inline_html:
            inline_html = f"<span class='inline-group'>{inline_html}</span>"

        return create_ser_result(text=inline_html, span_source=children)
701
+
702
+
703
class HTMLFallbackSerializer(BaseFallbackSerializer):
    """Serializer of last resort for HTML output."""

    @override
    def serialize(
        self,
        *,
        item: NodeItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Handle items that no other serializer took care of.

        Document items are rendered as an HTML comment naming their class;
        any other node yields an empty result.
        """
        if not isinstance(item, DocItem):
            # For group items, we don't generate any markup
            return create_ser_result()

        type_name = item.__class__.__name__
        return create_ser_result(
            text=f"<!-- Unhandled item type: {type_name} -->",
            span_source=item,
        )
724
+
725
+
726
class HTMLDocSerializer(DocSerializer):
    """HTML-specific document serializer."""

    text_serializer: BaseTextSerializer = HTMLTextSerializer()
    table_serializer: BaseTableSerializer = HTMLTableSerializer()
    picture_serializer: BasePictureSerializer = HTMLPictureSerializer()
    key_value_serializer: BaseKeyValueSerializer = HTMLKeyValueSerializer()
    form_serializer: BaseFormSerializer = HTMLFormSerializer()
    fallback_serializer: BaseFallbackSerializer = HTMLFallbackSerializer()

    list_serializer: BaseListSerializer = HTMLListSerializer()
    inline_serializer: BaseInlineSerializer = HTMLInlineSerializer()

    params: HTMLParams = HTMLParams()

    @override
    def serialize_bold(self, text: str, **kwargs) -> str:
        """Apply HTML-specific bold serialization."""
        return f"<strong>{text}</strong>"

    @override
    def serialize_italic(self, text: str, **kwargs) -> str:
        """Apply HTML-specific italic serialization."""
        return f"<em>{text}</em>"

    @override
    def serialize_underline(self, text: str, **kwargs) -> str:
        """Apply HTML-specific underline serialization."""
        return f"<u>{text}</u>"

    @override
    def serialize_strikethrough(self, text: str, **kwargs) -> str:
        """Apply HTML-specific strikethrough serialization."""
        return f"<del>{text}</del>"

    @override
    def serialize_hyperlink(
        self, text: str, hyperlink: Union[AnyUrl, Path], **kwargs
    ) -> str:
        """Apply HTML-specific hyperlink serialization.

        Args:
            text: The (already serialized) link content.
            hyperlink: The link target.

        Returns:
            An ``<a>`` element around ``text``.
        """
        # The target ends up in an attribute value, so quotes and ampersands
        # must be escaped to keep the markup well-formed.
        href = html.escape(str(hyperlink), quote=True)
        return f'<a href="{href}">{text}</a>'

    @override
    def serialize_doc(
        self, parts: list[SerializationResult], **kwargs
    ) -> SerializationResult:
        """Serialize a document out of its parts.

        Args:
            parts: Serialization results of the document's items.

        Returns:
            A result whose text is the complete HTML document.

        Raises:
            ValueError: If the output style is unknown, or if a page key is
                not an integer in split-page mode.
        """
        # Create HTML structure
        html_parts = [
            "<!DOCTYPE html>",
            "<html>",
            self._generate_head(),
            "<body>",
        ]

        if self.params.output_style == HTMLOutputStyle.SPLIT_PAGE:
            html_content = "\n".join([p.text for p in parts if p.text])
            next_page: Optional[int] = None
            prev_full_match_end = 0
            pages = {}
            # NOTE(review): if no page break is found, ``pages`` stays empty
            # and no content is emitted - confirm this case cannot occur.
            for full_match, prev_page, next_page in self._get_page_breaks(html_content):
                # Search from the end of the previous marker: this keeps the
                # slices right if a marker text repeats and avoids rescanning
                # the document from the start for every page.
                this_match_start = html_content.find(full_match, prev_full_match_end)
                pages[prev_page] = html_content[prev_full_match_end:this_match_start]
                prev_full_match_end = this_match_start + len(full_match)

            # capture last page
            if next_page is not None:
                pages[next_page] = html_content[prev_full_match_end:]

            html_parts.append("<table>")
            html_parts.append("<tbody>")

            applicable_pages = self._get_applicable_pages()
            for page_no, page in pages.items():
                if not isinstance(page_no, int):
                    raise ValueError(
                        "We need page-indices to leverage `split_page_view`"
                    )
                if applicable_pages is not None and page_no not in applicable_pages:
                    continue
                page_img = self.doc.pages[page_no].image

                # One row per page: page image on the left, content on the right
                html_parts.append("<tr>")

                html_parts.append("<td>")
                html_parts.append(self._page_image_html(page_img))
                html_parts.append("</td>")

                html_parts.append("<td>")
                html_parts.append(f"<div class='page'>\n{page}\n</div>")
                html_parts.append("</td>")

                html_parts.append("</tr>")

            html_parts.append("</tbody>")
            html_parts.append("</table>")

        elif self.params.output_style == HTMLOutputStyle.SINGLE_COLUMN:
            # Add all pages
            html_content = "\n".join([p.text for p in parts if p.text])
            html_content = f"<div class='page'>\n{html_content}\n</div>"
            html_parts.append(html_content)
        else:
            raise ValueError(f"unknown output-style: {self.params.output_style}")

        # Close HTML structure
        html_parts.extend(["</body>", "</html>"])

        # Join with newlines
        html_content = "\n".join(html_parts)

        return create_ser_result(text=html_content, span_source=parts)

    def _page_image_html(self, page_img: Optional[ImageRef]) -> str:
        """Return the ``<figure>`` markup for a page image in split-page mode."""
        if page_img is not None:
            # short-cut: we already have the image in base64
            if (
                isinstance(page_img, ImageRef)
                and isinstance(page_img.uri, AnyUrl)
                and page_img.uri.scheme == "data"
            ):
                return f'<figure><img src="{page_img.uri}"></figure>'

            if page_img._pil is not None:
                # Encode the in-memory image as a PNG data URI
                buffered = BytesIO()
                page_img._pil.save(buffered, format="PNG")
                img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
                return f'<figure><img src="data:image/png;base64,{img_base64}"></figure>'

        return "<figure>no page-image found</figure>"

    @override
    def serialize_captions(
        self,
        item: FloatingItem,
        tag: str = "figcaption",
        **kwargs,
    ) -> SerializationResult:
        """Serialize the item's captions.

        Args:
            item: The item whose captions are serialized.
            tag: Name of the HTML element wrapping the caption text.
            **kwargs: Parameter overrides merged into the serializer params.

        Returns:
            A result whose text is the escaped, joined caption text wrapped
            in ``tag``; the text is empty when captions are not among the
            requested labels or no caption text is available.
        """
        params = self.params.merge_with_patch(patch=kwargs)
        results: list[SerializationResult] = []
        text_res = ""
        if DocItemLabel.CAPTION in params.labels:
            # Loop-invariant, so looked up once rather than per caption
            excluded_refs = self.get_excluded_refs(**kwargs)
            results = [
                create_ser_result(text=it.text, span_source=it)
                for cap in item.captions
                if isinstance(it := cap.resolve(self.doc), TextItem)
                and it.self_ref not in excluded_refs
            ]
            text_res = params.caption_delim.join([r.text for r in results])
            if text_res:
                text_dir = get_text_direction(text_res)
                dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
                text_res = f"<{tag}{dir_str}>{html.escape(text_res)}</{tag}>"
        return create_ser_result(text=text_res, span_source=results)

    def _generate_head(self) -> str:
        """Generate the HTML head section with metadata and styles.

        Returns:
            The custom ``html_head`` verbatim when one is configured,
            otherwise a generated ``<head>`` element.

        Raises:
            ValueError: If no custom CSS is given and the output style is
                unknown.
        """
        params = self.params

        if params.html_head is not None:
            return params.html_head

        head_parts = ["<head>", '<meta charset="UTF-8">']

        # Add metadata if requested
        if params.add_document_metadata:
            if self.doc.name:
                head_parts.append(f"<title>{html.escape(self.doc.name)}</title>")
            else:
                head_parts.append("<title>Docling Document</title>")

            head_parts.append(
                '<meta name="generator" content="Docling HTML Serializer">'
            )

        # Add default styles or custom CSS
        if params.css_styles:
            if params.css_styles.startswith("<style>") and params.css_styles.endswith(
                "</style>"
            ):
                # Already a complete <style> element
                head_parts.append(f"\n{params.css_styles}\n")
            else:
                head_parts.append(f"<style>\n{params.css_styles}\n</style>")
        elif params.output_style == HTMLOutputStyle.SPLIT_PAGE:
            head_parts.append(_get_css_for_split_page())
        elif params.output_style == HTMLOutputStyle.SINGLE_COLUMN:
            head_parts.append(_get_css_for_single_column())
        else:
            raise ValueError(f"unknown output-style: {params.output_style}")

        head_parts.append("</head>")

        if params.prettify:
            return "\n".join(head_parts)
        return "".join(head_parts)

    def _get_default_css(self) -> str:
        """Return default CSS styles for the HTML document."""
        return "<style></style>"

    @override
    def requires_page_break(self):
        """Whether to add page breaks (only needed for the split-page style)."""
        return self.params.output_style == HTMLOutputStyle.SPLIT_PAGE