docling-core 2.24.1__py3-none-any.whl → 2.26.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -3,7 +3,6 @@
3
3
  import base64
4
4
  import copy
5
5
  import hashlib
6
- import html
7
6
  import itertools
8
7
  import json
9
8
  import logging
@@ -12,17 +11,12 @@ import os
12
11
  import re
13
12
  import sys
14
13
  import typing
15
- import warnings
16
14
  from enum import Enum
17
15
  from io import BytesIO
18
16
  from pathlib import Path
19
17
  from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
20
- from urllib.parse import quote, unquote
21
- from xml.etree.cElementTree import SubElement, tostring
22
- from xml.sax.saxutils import unescape
18
+ from urllib.parse import unquote
23
19
 
24
- import latex2mathml.converter
25
- import latex2mathml.exceptions
26
20
  import pandas as pd
27
21
  import yaml
28
22
  from PIL import Image as PILImage
@@ -49,13 +43,10 @@ from docling_core.types.doc.labels import (
49
43
  GraphCellLabel,
50
44
  GraphLinkLabel,
51
45
  GroupLabel,
46
+ PictureClassificationLabel,
52
47
  )
53
48
  from docling_core.types.doc.tokens import _LOC_PREFIX, DocumentToken, TableToken
54
- from docling_core.types.doc.utils import (
55
- get_html_tag_with_text_direction,
56
- get_text_direction,
57
- relative_path,
58
- )
49
+ from docling_core.types.doc.utils import relative_path
59
50
 
60
51
  _logger = logging.getLogger(__name__)
61
52
 
@@ -290,22 +281,6 @@ class PictureScatterChartData(PictureChartData):
290
281
  points: List[ChartPoint]
291
282
 
292
283
 
293
- PictureDataType = Annotated[
294
- Union[
295
- PictureClassificationData,
296
- PictureDescriptionData,
297
- PictureMoleculeData,
298
- PictureMiscData,
299
- PictureLineChartData,
300
- PictureBarChartData,
301
- PictureStackedBarChartData,
302
- PicturePieChartData,
303
- PictureScatterChartData,
304
- ],
305
- Field(discriminator="kind"),
306
- ]
307
-
308
-
309
284
  class TableCell(BaseModel):
310
285
  """TableCell."""
311
286
 
@@ -391,6 +366,35 @@ class TableData(BaseModel): # TBD
391
366
  return table_data
392
367
 
393
368
 
369
+ class PictureTabularChartData(PictureChartData):
370
+ """Base class for picture chart data.
371
+
372
+ Attributes:
373
+ title (str): The title of the chart.
374
+ chart_data (TableData): Chart data in the table format.
375
+ """
376
+
377
+ kind: Literal["tabular_chart_data"] = "tabular_chart_data"
378
+ chart_data: TableData
379
+
380
+
381
+ PictureDataType = Annotated[
382
+ Union[
383
+ PictureClassificationData,
384
+ PictureDescriptionData,
385
+ PictureMoleculeData,
386
+ PictureMiscData,
387
+ PictureTabularChartData,
388
+ PictureLineChartData,
389
+ PictureBarChartData,
390
+ PictureStackedBarChartData,
391
+ PicturePieChartData,
392
+ PictureScatterChartData,
393
+ ],
394
+ Field(discriminator="kind"),
395
+ ]
396
+
397
+
394
398
  class DocumentOrigin(BaseModel):
395
399
  """FileSource."""
396
400
 
@@ -458,8 +462,12 @@ class RefItem(BaseModel):
458
462
  populate_by_name=True,
459
463
  )
460
464
 
465
+ def _split_ref_to_path(self):
466
+ """Get the path of the reference."""
467
+ return self.cref.split("/")
468
+
461
469
  def resolve(self, doc: "DoclingDocument"):
462
- """resolve."""
470
+ """Resolve the path in the document."""
463
471
  path_components = self.cref.split("/")
464
472
  if (num_comps := len(path_components)) == 3:
465
473
  _, path, index_str = path_components
@@ -542,25 +550,32 @@ class DocTagsDocument(BaseModel):
542
550
 
543
551
  @classmethod
544
552
  def from_doctags_and_image_pairs(
545
- cls, doctags: List[Union[Path, str]], images: List[Union[Path, PILImage.Image]]
553
+ cls,
554
+ doctags: typing.Sequence[Union[Path, str]],
555
+ images: Optional[List[Union[Path, PILImage.Image]]],
546
556
  ):
547
557
  """from_doctags_and_image_pairs."""
548
- if len(doctags) != len(images):
558
+ if images is not None and len(doctags) != len(images):
549
559
  raise ValueError("Number of page doctags must be equal to page images!")
550
560
  doctags_doc = cls()
551
561
 
552
562
  pages = []
553
- for dt, img in zip(doctags, images):
563
+
564
+ for ix, dt in enumerate(doctags):
554
565
  if isinstance(dt, Path):
555
566
  with dt.open("r") as fp:
556
567
  dt = fp.read()
557
568
  elif isinstance(dt, str):
558
569
  pass
559
570
 
560
- if isinstance(img, Path):
561
- img = PILImage.open(img)
562
- elif isinstance(dt, PILImage.Image):
563
- pass
571
+ img = None
572
+ if images is not None:
573
+ img = images[ix]
574
+
575
+ if isinstance(img, Path):
576
+ img = PILImage.open(img)
577
+ elif isinstance(img, PILImage.Image):
578
+ pass
564
579
 
565
580
  page = DocTagsPage(tokens=dt, image=img)
566
581
  pages.append(page)
@@ -568,6 +583,25 @@ class DocTagsDocument(BaseModel):
568
583
  doctags_doc.pages = pages
569
584
  return doctags_doc
570
585
 
586
+ @classmethod
587
+ def from_multipage_doctags_and_images(
588
+ cls,
589
+ doctags: Union[Path, str],
590
+ images: Optional[List[Union[Path, PILImage.Image]]],
591
+ ):
592
+ """From doctags with `<page_break>` and corresponding list of page images."""
593
+ if isinstance(doctags, Path):
594
+ with doctags.open("r") as fp:
595
+ doctags = fp.read()
596
+ dt_list = (
597
+ doctags.removeprefix(f"<{DocumentToken.DOCUMENT.value}>")
598
+ .removesuffix(f"</{DocumentToken.DOCUMENT.value}>")
599
+ .split(f"<{DocumentToken.PAGE_BREAK.value}>")
600
+ )
601
+ dt_list = [el.strip() for el in dt_list]
602
+
603
+ return cls.from_doctags_and_image_pairs(dt_list, images)
604
+
571
605
 
572
606
  class ProvenanceItem(BaseModel):
573
607
  """ProvenanceItem."""
@@ -598,10 +632,98 @@ class NodeItem(BaseModel):
598
632
 
599
633
  model_config = ConfigDict(extra="forbid")
600
634
 
601
- def get_ref(self):
635
+ def get_ref(self) -> RefItem:
602
636
  """get_ref."""
603
637
  return RefItem(cref=self.self_ref)
604
638
 
639
+ def _get_parent_ref(
640
+ self, doc: "DoclingDocument", stack: list[int]
641
+ ) -> Optional[RefItem]:
642
+ """get_parent_ref."""
643
+ if len(stack) == 0:
644
+ return self.parent
645
+ elif len(stack) > 0 and stack[0] < len(self.children):
646
+ item = self.children[stack[0]].resolve(doc)
647
+ return item._get_parent_ref(doc=doc, stack=stack[1:])
648
+
649
+ return None
650
+
651
+ def _delete_child(self, doc: "DoclingDocument", stack: list[int]) -> bool:
652
+ """Delete child node in tree."""
653
+ if len(stack) == 1 and stack[0] < len(self.children):
654
+ del self.children[stack[0]]
655
+ return True
656
+ elif len(stack) > 1 and stack[0] < len(self.children):
657
+ item = self.children[stack[0]].resolve(doc)
658
+ return item._delete_child(doc=doc, stack=stack[1:])
659
+
660
+ return False
661
+
662
+ def _update_child(
663
+ self, doc: "DoclingDocument", stack: list[int], new_ref: RefItem
664
+ ) -> bool:
665
+ """Update child node in tree."""
666
+ if len(stack) == 1 and stack[0] < len(self.children):
667
+ # ensure the parent is correct
668
+ new_item = new_ref.resolve(doc=doc)
669
+ new_item.parent = self.get_ref()
670
+
671
+ self.children[stack[0]] = new_ref
672
+ return True
673
+ elif len(stack) > 1 and stack[0] < len(self.children):
674
+ item = self.children[stack[0]].resolve(doc)
675
+ return item._update_child(doc=doc, stack=stack[1:], new_ref=new_ref)
676
+
677
+ return False
678
+
679
+ def _add_child(
680
+ self, doc: "DoclingDocument", stack: list[int], new_ref: RefItem
681
+ ) -> bool:
682
+ """Append child to node identified by stack."""
683
+ if len(stack) == 0:
684
+
685
+ # ensure the parent is correct
686
+ new_item = new_ref.resolve(doc=doc)
687
+ new_item.parent = self.get_ref()
688
+
689
+ self.children.append(new_ref)
690
+ return True
691
+ elif len(stack) > 0 and stack[0] < len(self.children):
692
+ item = self.children[stack[0]].resolve(doc)
693
+ return item._add_child(doc=doc, stack=stack[1:], new_ref=new_ref)
694
+
695
+ return False
696
+
697
+ def _add_sibling(
698
+ self,
699
+ doc: "DoclingDocument",
700
+ stack: list[int],
701
+ new_ref: RefItem,
702
+ after: bool = True,
703
+ ) -> bool:
704
+ """Add sibling node in tree."""
705
+ if len(stack) == 1 and stack[0] < len(self.children) and (not after):
706
+ # ensure the parent is correct
707
+ new_item = new_ref.resolve(doc=doc)
708
+ new_item.parent = self.get_ref()
709
+
710
+ self.children.insert(stack[0], new_ref)
711
+ return True
712
+ elif len(stack) == 1 and stack[0] < len(self.children) and (after):
713
+ # ensure the parent is correct
714
+ new_item = new_ref.resolve(doc=doc)
715
+ new_item.parent = self.get_ref()
716
+
717
+ self.children.insert(stack[0] + 1, new_ref)
718
+ return True
719
+ elif len(stack) > 1 and stack[0] < len(self.children):
720
+ item = self.children[stack[0]].resolve(doc)
721
+ return item._add_sibling(
722
+ doc=doc, stack=stack[1:], new_ref=new_ref, after=after
723
+ )
724
+
725
+ return False
726
+
605
727
 
606
728
  class GroupItem(NodeItem): # Container type, can't be a leaf node
607
729
  """GroupItem."""
@@ -722,7 +844,9 @@ class TextItem(DocItem):
722
844
  text: str # sanitized representation
723
845
 
724
846
  formatting: Optional[Formatting] = None
725
- hyperlink: Optional[Union[AnyUrl, Path]] = None
847
+ hyperlink: Optional[Union[AnyUrl, Path]] = Field(
848
+ union_mode="left_to_right", default=None
849
+ )
726
850
 
727
851
  @deprecated("Use export_to_doctags() instead.")
728
852
  def export_to_document_tokens(self, *args, **kwargs):
@@ -925,7 +1049,9 @@ class FormulaItem(TextItem):
925
1049
  class PictureItem(FloatingItem):
926
1050
  """PictureItem."""
927
1051
 
928
- label: typing.Literal[DocItemLabel.PICTURE] = DocItemLabel.PICTURE
1052
+ label: typing.Literal[DocItemLabel.PICTURE, DocItemLabel.CHART] = (
1053
+ DocItemLabel.PICTURE
1054
+ )
929
1055
 
930
1056
  annotations: List[PictureDataType] = []
931
1057
 
@@ -992,54 +1118,19 @@ class PictureItem(FloatingItem):
992
1118
  image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
993
1119
  ) -> str:
994
1120
  """Export picture to HTML format."""
995
- text = ""
996
- if add_caption and len(self.captions):
997
- text = self.caption_text(doc)
998
-
999
- caption_text = ""
1000
- if len(text) > 0:
1001
- caption_text = get_html_tag_with_text_direction(
1002
- html_tag="figcaption", text=text
1003
- )
1004
-
1005
- default_response = f"<figure>{caption_text}</figure>"
1006
-
1007
- if image_mode == ImageRefMode.PLACEHOLDER:
1008
- return default_response
1009
-
1010
- elif image_mode == ImageRefMode.EMBEDDED:
1011
- # short-cut: we already have the image in base64
1012
- if (
1013
- isinstance(self.image, ImageRef)
1014
- and isinstance(self.image.uri, AnyUrl)
1015
- and self.image.uri.scheme == "data"
1016
- ):
1017
- img_text = f'<img src="{self.image.uri}">'
1018
- return f"<figure>{caption_text}{img_text}</figure>"
1019
-
1020
- # get the self.image._pil or crop it out of the page-image
1021
- img = self.get_image(doc)
1022
-
1023
- if img is not None:
1024
- imgb64 = self._image_to_base64(img)
1025
- img_text = f'<img src="data:image/png;base64,{imgb64}">'
1026
-
1027
- return f"<figure>{caption_text}{img_text}</figure>"
1028
- else:
1029
- return default_response
1030
-
1031
- elif image_mode == ImageRefMode.REFERENCED:
1032
-
1033
- if not isinstance(self.image, ImageRef) or (
1034
- isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "data"
1035
- ):
1036
- return default_response
1037
-
1038
- img_text = f'<img src="{quote(str(self.image.uri))}">'
1039
- return f"<figure>{caption_text}{img_text}</figure>"
1121
+ from docling_core.experimental.serializer.html import (
1122
+ HTMLDocSerializer,
1123
+ HTMLParams,
1124
+ )
1040
1125
 
1041
- else:
1042
- return default_response
1126
+ serializer = HTMLDocSerializer(
1127
+ doc=doc,
1128
+ params=HTMLParams(
1129
+ image_mode=image_mode,
1130
+ ),
1131
+ )
1132
+ text = serializer.serialize(item=self).text
1133
+ return text
1043
1134
 
1044
1135
  @deprecated("Use export_to_doctags() instead.")
1045
1136
  def export_to_document_tokens(self, *args, **kwargs):
@@ -1190,81 +1281,18 @@ class TableItem(FloatingItem):
1190
1281
  add_caption: bool = True,
1191
1282
  ) -> str:
1192
1283
  """Export the table as html."""
1193
- if doc is None:
1194
- warnings.warn(
1195
- "The `doc` argument will be mandatory in a future version. "
1196
- "It must be provided to include a caption.",
1197
- DeprecationWarning,
1198
- )
1199
-
1200
- nrows = self.data.num_rows
1201
- ncols = self.data.num_cols
1202
-
1203
- text = ""
1204
- if doc is not None and add_caption and len(self.captions):
1205
- text = html.escape(self.caption_text(doc))
1206
-
1207
- if len(self.data.table_cells) == 0:
1208
- return ""
1209
-
1210
- body = ""
1211
-
1212
- for i in range(nrows):
1213
- body += "<tr>"
1214
- for j in range(ncols):
1215
- cell: TableCell = self.data.grid[i][j]
1216
-
1217
- rowspan, rowstart = (
1218
- cell.row_span,
1219
- cell.start_row_offset_idx,
1220
- )
1221
- colspan, colstart = (
1222
- cell.col_span,
1223
- cell.start_col_offset_idx,
1224
- )
1225
-
1226
- if rowstart != i:
1227
- continue
1228
- if colstart != j:
1229
- continue
1230
-
1231
- content = html.escape(cell.text.strip())
1232
- celltag = "td"
1233
- if cell.column_header:
1234
- celltag = "th"
1235
-
1236
- opening_tag = f"{celltag}"
1237
- if rowspan > 1:
1238
- opening_tag += f' rowspan="{rowspan}"'
1239
- if colspan > 1:
1240
- opening_tag += f' colspan="{colspan}"'
1241
-
1242
- text_dir = get_text_direction(content)
1243
- if text_dir == "rtl":
1244
- opening_tag += f' dir="{dir}"'
1245
-
1246
- body += f"<{opening_tag}>{content}</{celltag}>"
1247
- body += "</tr>"
1248
-
1249
- # dir = get_text_direction(text)
1250
-
1251
- if len(text) > 0 and len(body) > 0:
1252
- caption_text = get_html_tag_with_text_direction(
1253
- html_tag="caption", text=text
1254
- )
1255
- body = f"<table>{caption_text}<tbody>{body}</tbody></table>"
1284
+ if doc is not None:
1285
+ from docling_core.experimental.serializer.html import HTMLDocSerializer
1256
1286
 
1257
- elif len(text) == 0 and len(body) > 0:
1258
- body = f"<table><tbody>{body}</tbody></table>"
1259
- elif len(text) > 0 and len(body) == 0:
1260
- caption_text = get_html_tag_with_text_direction(
1261
- html_tag="caption", text=text
1262
- )
1263
- body = f"<table>{caption_text}</table>"
1287
+ serializer = HTMLDocSerializer(doc=doc)
1288
+ text = serializer.serialize(item=self).text
1289
+ return text
1264
1290
  else:
1265
- body = "<table></table>"
1266
-
1267
- return body
1291
+ _logger.error(
1292
+ "Usage of TableItem.export_to_html() without `doc` argument is "
1293
+ "deprecated.",
1294
+ )
1295
+ return ""
1268
1296
 
1269
1297
  def export_to_otsl(
1270
1298
  self,
@@ -1539,76 +1567,6 @@ class PageItem(BaseModel):
1539
1567
  class DoclingDocument(BaseModel):
1540
1568
  """DoclingDocument."""
1541
1569
 
1542
- _HTML_DEFAULT_HEAD: str = r"""<head>
1543
- <link rel="icon" type="image/png"
1544
- href="https://raw.githubusercontent.com/docling-project/docling/refs/heads/main/docs/assets/logo.svg"/>
1545
- <meta charset="UTF-8">
1546
- <title>
1547
- Powered by Docling
1548
- </title>
1549
- <style>
1550
- html {
1551
- background-color: LightGray;
1552
- }
1553
- body {
1554
- margin: 0 auto;
1555
- width:800px;
1556
- padding: 30px;
1557
- background-color: White;
1558
- font-family: Arial, sans-serif;
1559
- box-shadow: 10px 10px 10px grey;
1560
- }
1561
- figure{
1562
- display: block;
1563
- width: 100%;
1564
- margin: 0px;
1565
- margin-top: 10px;
1566
- margin-bottom: 10px;
1567
- }
1568
- img {
1569
- display: block;
1570
- margin: auto;
1571
- margin-top: 10px;
1572
- margin-bottom: 10px;
1573
- max-width: 640px;
1574
- max-height: 640px;
1575
- }
1576
- table {
1577
- min-width:500px;
1578
- background-color: White;
1579
- border-collapse: collapse;
1580
- cell-padding: 5px;
1581
- margin: auto;
1582
- margin-top: 10px;
1583
- margin-bottom: 10px;
1584
- }
1585
- th, td {
1586
- border: 1px solid black;
1587
- padding: 8px;
1588
- }
1589
- th {
1590
- font-weight: bold;
1591
- }
1592
- table tr:nth-child(even) td{
1593
- background-color: LightGray;
1594
- }
1595
- math annotation {
1596
- display: none;
1597
- }
1598
- .formula-not-decoded {
1599
- background: repeating-linear-gradient(
1600
- 45deg, /* Angle of the stripes */
1601
- LightGray, /* First color */
1602
- LightGray 10px, /* Length of the first color */
1603
- White 10px, /* Second color */
1604
- White 20px /* Length of the second color */
1605
- );
1606
- margin: 0;
1607
- text-align: center;
1608
- }
1609
- </style>
1610
- </head>"""
1611
-
1612
1570
  schema_name: typing.Literal["DoclingDocument"] = "DoclingDocument"
1613
1571
  version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
1614
1572
  CURRENT_VERSION
@@ -1655,6 +1613,364 @@ class DoclingDocument(BaseModel):
1655
1613
  item["content_layer"] = "furniture"
1656
1614
  return data
1657
1615
 
1616
+ # ---------------------------
1617
+ # Public Manipulation methods
1618
+ # ---------------------------
1619
+
1620
+ def append_child_item(
1621
+ self, *, child: NodeItem, parent: Optional[NodeItem] = None
1622
+ ) -> None:
1623
+ """Adds an item."""
1624
+ if len(child.children) > 0:
1625
+ raise ValueError("Can not append a child with children")
1626
+
1627
+ parent = parent if parent is not None else self.body
1628
+
1629
+ success, stack = self._get_stack_of_item(item=parent)
1630
+
1631
+ if not success:
1632
+ raise ValueError(
1633
+ f"Could not resolve the parent node in the document tree: {parent}"
1634
+ )
1635
+
1636
+ # Append the item to the attributes of the doc
1637
+ self._append_item(item=child, parent_ref=parent.get_ref())
1638
+
1639
+ # Update the tree of the doc
1640
+ success = self.body._add_child(doc=self, new_ref=child.get_ref(), stack=stack)
1641
+
1642
+ # Clean the attribute (orphan) if not successful
1643
+ if not success:
1644
+ self._pop_item(item=child)
1645
+ raise ValueError(f"Could not append child: {child} to parent: {parent}")
1646
+
1647
+ def insert_item_after_sibling(
1648
+ self, *, new_item: NodeItem, sibling: NodeItem
1649
+ ) -> None:
1650
+ """Inserts an item, given its node_item instance, after other as a sibling."""
1651
+ self._insert_item_at_refitem(item=new_item, ref=sibling.get_ref(), after=True)
1652
+
1653
+ def insert_item_before_sibling(
1654
+ self, *, new_item: NodeItem, sibling: NodeItem
1655
+ ) -> None:
1656
+ """Inserts an item, given its node_item instance, before other as a sibling."""
1657
+ self._insert_item_at_refitem(item=new_item, ref=sibling.get_ref(), after=False)
1658
+
1659
+ def delete_items(self, *, node_items: List[NodeItem]) -> None:
1660
+ """Deletes an item, given its instance or ref, and any children it has."""
1661
+ refs = []
1662
+ for _ in node_items:
1663
+ refs.append(_.get_ref())
1664
+
1665
+ self._delete_items(refs=refs)
1666
+
1667
+ def replace_item(self, *, new_item: NodeItem, old_item: NodeItem) -> None:
1668
+ """Replace item with new item."""
1669
+ self.insert_item_after_sibling(new_item=new_item, sibling=old_item)
1670
+ self.delete_items(node_items=[old_item])
1671
+
1672
+ # ----------------------------
1673
+ # Private Manipulation methods
1674
+ # ----------------------------
1675
+
1676
+ def _get_stack_of_item(self, item: NodeItem) -> tuple[bool, list[int]]:
1677
+ """Find the stack indices of the item."""
1678
+ return self._get_stack_of_refitem(ref=item.get_ref())
1679
+
1680
+ def _get_stack_of_refitem(self, ref: RefItem) -> tuple[bool, list[int]]:
1681
+ """Find the stack indices of the reference."""
1682
+ if ref == self.body.get_ref():
1683
+ return (True, [])
1684
+
1685
+ node = ref.resolve(doc=self)
1686
+ parent_ref = node._get_parent_ref(doc=self, stack=[])
1687
+
1688
+ if parent_ref is None:
1689
+ return (False, [])
1690
+
1691
+ stack: list[int] = []
1692
+ while parent_ref is not None:
1693
+ parent = parent_ref.resolve(doc=self)
1694
+
1695
+ index = parent.children.index(node.get_ref())
1696
+ stack.insert(0, index) # prepend the index
1697
+
1698
+ node = parent
1699
+ parent_ref = node._get_parent_ref(doc=self, stack=[])
1700
+
1701
+ return (True, stack)
1702
+
1703
+ def _insert_item_at_refitem(
1704
+ self, item: NodeItem, ref: RefItem, after: bool
1705
+ ) -> RefItem:
1706
+ """Insert node-item using the self-reference."""
1707
+ success, stack = self._get_stack_of_refitem(ref=ref)
1708
+
1709
+ if not success:
1710
+ raise ValueError(
1711
+ f"Could not insert at {ref.cref}: could not find the stack"
1712
+ )
1713
+
1714
+ return self._insert_item_at_stack(item=item, stack=stack, after=after)
1715
+
1716
+ def _append_item(self, *, item: NodeItem, parent_ref: RefItem) -> RefItem:
1717
+ """Append item of its type."""
1718
+ cref: str = "" # to be updated
1719
+
1720
+ if isinstance(item, TextItem):
1721
+ item_label = "texts"
1722
+ item_index = len(self.texts)
1723
+
1724
+ cref = f"#/{item_label}/{item_index}"
1725
+
1726
+ item.self_ref = cref
1727
+ item.parent = parent_ref
1728
+
1729
+ self.texts.append(item)
1730
+
1731
+ elif isinstance(item, TableItem):
1732
+ item_label = "tables"
1733
+ item_index = len(self.tables)
1734
+
1735
+ cref = f"#/{item_label}/{item_index}"
1736
+
1737
+ item.self_ref = cref
1738
+ item.parent = parent_ref
1739
+
1740
+ self.tables.append(item)
1741
+
1742
+ elif isinstance(item, PictureItem):
1743
+ item_label = "pictures"
1744
+ item_index = len(self.pictures)
1745
+
1746
+ cref = f"#/{item_label}/{item_index}"
1747
+
1748
+ item.self_ref = cref
1749
+ item.parent = parent_ref
1750
+
1751
+ self.pictures.append(item)
1752
+
1753
+ elif isinstance(item, KeyValueItem):
1754
+ item_label = "key_value_items"
1755
+ item_index = len(self.key_value_items)
1756
+
1757
+ cref = f"#/{item_label}/{item_index}"
1758
+
1759
+ item.self_ref = cref
1760
+ item.parent = parent_ref
1761
+
1762
+ self.key_value_items.append(item)
1763
+
1764
+ elif isinstance(item, FormItem):
1765
+ item_label = "form_items"
1766
+ item_index = len(self.form_items)
1767
+
1768
+ cref = f"#/{item_label}/{item_index}"
1769
+
1770
+ item.self_ref = cref
1771
+ item.parent = parent_ref
1772
+
1773
+ self.form_items.append(item)
1774
+ else:
1775
+ raise ValueError(f"Item {item} is not supported for insertion")
1776
+
1777
+ return RefItem(cref=cref)
1778
+
1779
+ def _pop_item(self, *, item: NodeItem):
1780
+ """Pop the last item of its type."""
1781
+ path = item.self_ref.split("/")
1782
+
1783
+ if len(path) != 3:
1784
+ raise ValueError(f"Can not pop item with path: {path}")
1785
+
1786
+ item_label = path[1]
1787
+ item_index = int(path[2])
1788
+
1789
+ if (
1790
+ len(self.__getattribute__(item_label)) + 1 == item_index
1791
+ ): # we can only pop the last item
1792
+ del self.__getattribute__(item_label)[item_index]
1793
+ else:
1794
+ msg = f"index:{item_index}, len:{len(self.__getattribute__(item_label))}"
1795
+ raise ValueError(f"Failed to pop: item is not last ({msg})")
1796
+
1797
+ def _insert_item_at_stack(
1798
+ self, item: NodeItem, stack: list[int], after: bool
1799
+ ) -> RefItem:
1800
+ """Insert node-item using the self-reference."""
1801
+ parent_ref = self.body._get_parent_ref(doc=self, stack=stack)
1802
+
1803
+ if parent_ref is None:
1804
+ raise ValueError(f"Could not find a parent at stack: {stack}")
1805
+
1806
+ new_ref = self._append_item(item=item, parent_ref=parent_ref)
1807
+
1808
+ success = self.body._add_sibling(
1809
+ doc=self, stack=stack, new_ref=new_ref, after=after
1810
+ )
1811
+
1812
+ if not success:
1813
+ self._pop_item(item=item)
1814
+
1815
+ return item.get_ref()
1816
+
1817
+ def _delete_items(self, refs: list[RefItem]) -> bool:
1818
+ """Delete document item using the self-reference."""
1819
+ to_be_deleted_items: dict[tuple[int, ...], str] = {} # stack to cref
1820
+
1821
+ # Identify the to_be_deleted_items
1822
+ for item, stack in self._iterate_items_with_stack(with_groups=True):
1823
+ ref = item.get_ref()
1824
+
1825
+ if ref in refs:
1826
+ to_be_deleted_items[tuple(stack)] = ref.cref
1827
+
1828
+ substacks = [stack[0 : i + 1] for i in range(len(stack) - 1)]
1829
+ for substack in substacks:
1830
+ if tuple(substack) in to_be_deleted_items:
1831
+ to_be_deleted_items[tuple(stack)] = ref.cref
1832
+
1833
+ if len(to_be_deleted_items) == 0:
1834
+ raise ValueError("Nothing to be deleted ...")
1835
+
1836
+ # Clean the tree, reverse the order to not have to update
1837
+ for stack_, ref_ in reversed(sorted(to_be_deleted_items.items())):
1838
+ success = self.body._delete_child(doc=self, stack=list(stack_))
1839
+
1840
+ if not success:
1841
+ del to_be_deleted_items[stack_]
1842
+ else:
1843
+ _logger.info(f"deleted item in tree at stack: {stack_} => {ref_}")
1844
+
1845
+ # Create a new lookup of the orphans:
1846
+ # dict of item_label (`texts`, `tables`, ...) to a
1847
+ # dict of item_label with delta (default = -1).
1848
+ lookup: dict[str, dict[int, int]] = {}
1849
+
1850
+ for stack_, ref_ in to_be_deleted_items.items():
1851
+ path = ref_.split("/")
1852
+ if len(path) == 3:
1853
+
1854
+ item_label = path[1]
1855
+ item_index = int(path[2])
1856
+
1857
+ if item_label not in lookup:
1858
+ lookup[item_label] = {}
1859
+
1860
+ lookup[item_label][item_index] = -1
1861
+
1862
+ # Remove the orphans in reverse order
1863
+ for item_label, item_inds in lookup.items():
1864
+ for item_index, val in reversed(
1865
+ sorted(item_inds.items())
1866
+ ): # make sure you delete the last in the list first!
1867
+ _logger.debug(f"deleting item in doc for {item_label} for {item_index}")
1868
+ del self.__getattribute__(item_label)[item_index]
1869
+
1870
+ self._update_breadth_first_with_lookup(
1871
+ node=self.body, refs_to_be_deleted=refs, lookup=lookup
1872
+ )
1873
+
1874
+ return True
1875
+
1876
+ # Update the references
1877
+ def _update_ref_with_lookup(
1878
+ self, item_label: str, item_index: int, lookup: dict[str, dict[int, int]]
1879
+ ) -> RefItem:
1880
+ """Update ref with lookup."""
1881
+ if item_label not in lookup: # Nothing to be done
1882
+ return RefItem(cref=f"#/{item_label}/{item_index}")
1883
+
1884
+ # Count how many items have been deleted in front of you
1885
+ delta = sum(
1886
+ val if item_index >= key else 0 for key, val in lookup[item_label].items()
1887
+ )
1888
+ new_index = item_index + delta
1889
+
1890
+ return RefItem(cref=f"#/{item_label}/{new_index}")
1891
+
1892
+ def _update_refitems_with_lookup(
1893
+ self,
1894
+ ref_items: list[RefItem],
1895
+ refs_to_be_deleted: list[RefItem],
1896
+ lookup: dict[str, dict[int, int]],
1897
+ ) -> list[RefItem]:
1898
+ """Update refitems with lookup."""
1899
+ new_refitems = []
1900
+ for ref_item in ref_items:
1901
+
1902
+ if (
1903
+ ref_item not in refs_to_be_deleted
1904
+ ): # if ref_item is in ref, then delete/skip them
1905
+ path = ref_item._split_ref_to_path()
1906
+ if len(path) == 3:
1907
+ new_refitems.append(
1908
+ self._update_ref_with_lookup(
1909
+ item_label=path[1],
1910
+ item_index=int(path[2]),
1911
+ lookup=lookup,
1912
+ )
1913
+ )
1914
+ else:
1915
+ new_refitems.append(ref_item)
1916
+
1917
+ return new_refitems
1918
+
1919
+ def _update_breadth_first_with_lookup(
1920
+ self,
1921
+ node: NodeItem,
1922
+ refs_to_be_deleted: list[RefItem],
1923
+ lookup: dict[str, dict[int, int]],
1924
+ ):
1925
+ """Update breadth first with lookup."""
1926
+ # Update the captions, references and footnote references
1927
+ if isinstance(node, FloatingItem):
1928
+ node.captions = self._update_refitems_with_lookup(
1929
+ ref_items=node.captions,
1930
+ refs_to_be_deleted=refs_to_be_deleted,
1931
+ lookup=lookup,
1932
+ )
1933
+ node.references = self._update_refitems_with_lookup(
1934
+ ref_items=node.references,
1935
+ refs_to_be_deleted=refs_to_be_deleted,
1936
+ lookup=lookup,
1937
+ )
1938
+ node.footnotes = self._update_refitems_with_lookup(
1939
+ ref_items=node.footnotes,
1940
+ refs_to_be_deleted=refs_to_be_deleted,
1941
+ lookup=lookup,
1942
+ )
1943
+
1944
+ # Update the self_ref reference
1945
+ if node.parent is not None:
1946
+ path = node.parent._split_ref_to_path()
1947
+ if len(path) == 3:
1948
+ node.parent = self._update_ref_with_lookup(
1949
+ item_label=path[1], item_index=int(path[2]), lookup=lookup
1950
+ )
1951
+
1952
+ # Update the parent reference
1953
+ if node.self_ref is not None:
1954
+ path = node.self_ref.split("/")
1955
+ if len(path) == 3:
1956
+ _ref = self._update_ref_with_lookup(
1957
+ item_label=path[1], item_index=int(path[2]), lookup=lookup
1958
+ )
1959
+ node.self_ref = _ref.cref
1960
+
1961
+ # Update the child references
1962
+ node.children = self._update_refitems_with_lookup(
1963
+ ref_items=node.children,
1964
+ refs_to_be_deleted=refs_to_be_deleted,
1965
+ lookup=lookup,
1966
+ )
1967
+
1968
+ for i, child_ref in enumerate(node.children):
1969
+ node = child_ref.resolve(self)
1970
+ self._update_breadth_first_with_lookup(
1971
+ node=node, refs_to_be_deleted=refs_to_be_deleted, lookup=lookup
1972
+ )
1973
+
1658
1974
  ###################################
1659
1975
  # TODO: refactor add* methods below
1660
1976
  ###################################
@@ -2293,21 +2609,33 @@ class DoclingDocument(BaseModel):
2293
2609
  included_content_layers: Optional[set[ContentLayer]] = None,
2294
2610
  _level: int = 0, # fixed parameter, carries through the node nesting level
2295
2611
  ) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
2296
- """iterate_elements.
2297
-
2298
- :param root: Optional[NodeItem]: (Default value = None)
2299
- :param with_groups: bool: (Default value = False)
2300
- :param traverse_pictures: bool: (Default value = False)
2301
- :param page_no: Optional[int]: (Default value = None)
2302
- :param _level: (Default value = 0)
2303
- :param # fixed parameter:
2304
- :param carries through the node nesting level:
2305
- """
2612
+ """Iterate elements with level."""
2613
+ for item, stack in self._iterate_items_with_stack(
2614
+ root=root,
2615
+ with_groups=with_groups,
2616
+ traverse_pictures=traverse_pictures,
2617
+ page_no=page_no,
2618
+ included_content_layers=included_content_layers,
2619
+ ):
2620
+ yield item, len(stack)
2621
+
2622
+ def _iterate_items_with_stack(
2623
+ self,
2624
+ root: Optional[NodeItem] = None,
2625
+ with_groups: bool = False,
2626
+ traverse_pictures: bool = False,
2627
+ page_no: Optional[int] = None,
2628
+ included_content_layers: Optional[set[ContentLayer]] = None,
2629
+ _stack: Optional[list[int]] = None,
2630
+ ) -> typing.Iterable[Tuple[NodeItem, list[int]]]: # tuple of node and level
2631
+ """Iterate elements with stack."""
2306
2632
  my_layers = (
2307
2633
  included_content_layers
2308
2634
  if included_content_layers is not None
2309
2635
  else DEFAULT_CONTENT_LAYERS
2310
2636
  )
2637
+ my_stack: list[int] = _stack if _stack is not None else []
2638
+
2311
2639
  if not root:
2312
2640
  root = self.body
2313
2641
 
@@ -2327,25 +2655,31 @@ class DoclingDocument(BaseModel):
2327
2655
  )
2328
2656
 
2329
2657
  if should_yield:
2330
- yield root, _level
2658
+ yield root, my_stack
2331
2659
 
2332
2660
  # Handle picture traversal - only traverse children if requested
2333
2661
  if isinstance(root, PictureItem) and not traverse_pictures:
2334
2662
  return
2335
2663
 
2664
+ my_stack.append(-1)
2665
+
2336
2666
  # Traverse children
2337
- for child_ref in root.children:
2667
+ for child_ind, child_ref in enumerate(root.children):
2668
+ my_stack[-1] = child_ind
2338
2669
  child = child_ref.resolve(self)
2670
+
2339
2671
  if isinstance(child, NodeItem):
2340
- yield from self.iterate_items(
2672
+ yield from self._iterate_items_with_stack(
2341
2673
  child,
2342
2674
  with_groups=with_groups,
2343
2675
  traverse_pictures=traverse_pictures,
2344
2676
  page_no=page_no,
2345
- _level=_level + 1,
2677
+ _stack=my_stack,
2346
2678
  included_content_layers=my_layers,
2347
2679
  )
2348
2680
 
2681
+ my_stack.pop()
2682
+
2349
2683
  def _clear_picture_pil_cache(self):
2350
2684
  """Clear cache storage of all images."""
2351
2685
  for item, level in self.iterate_items(with_groups=False):
@@ -2618,6 +2952,7 @@ class DoclingDocument(BaseModel):
2618
2952
  strict_text: bool = False,
2619
2953
  escape_underscores: bool = True,
2620
2954
  image_placeholder: str = "<!-- image -->",
2955
+ enable_chart_tables: bool = True,
2621
2956
  image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
2622
2957
  indent: int = 4,
2623
2958
  text_width: int = -1,
@@ -2685,6 +3020,7 @@ class DoclingDocument(BaseModel):
2685
3020
  stop_idx=to_element,
2686
3021
  escape_underscores=escape_underscores,
2687
3022
  image_placeholder=image_placeholder,
3023
+ enable_chart_tables=enable_chart_tables,
2688
3024
  image_mode=image_mode,
2689
3025
  indent=indent,
2690
3026
  wrap_width=text_width if text_width > 0 else None,
@@ -2735,12 +3071,14 @@ class DoclingDocument(BaseModel):
2735
3071
  formula_to_mathml: bool = True,
2736
3072
  page_no: Optional[int] = None,
2737
3073
  html_lang: str = "en",
2738
- html_head: str = _HTML_DEFAULT_HEAD,
3074
+ html_head: str = "null", # should be deprecated
2739
3075
  included_content_layers: Optional[set[ContentLayer]] = None,
3076
+ split_page_view: bool = False,
2740
3077
  ):
2741
3078
  """Save to HTML."""
2742
3079
  if isinstance(filename, str):
2743
3080
  filename = Path(filename)
3081
+
2744
3082
  artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
2745
3083
 
2746
3084
  if image_mode == ImageRefMode.REFERENCED:
@@ -2760,6 +3098,7 @@ class DoclingDocument(BaseModel):
2760
3098
  html_lang=html_lang,
2761
3099
  html_head=html_head,
2762
3100
  included_content_layers=included_content_layers,
3101
+ split_page_view=split_page_view,
2763
3102
  )
2764
3103
 
2765
3104
  with open(filename, "w", encoding="utf-8") as fw:
@@ -2808,245 +3147,51 @@ class DoclingDocument(BaseModel):
2808
3147
  formula_to_mathml: bool = True,
2809
3148
  page_no: Optional[int] = None,
2810
3149
  html_lang: str = "en",
2811
- html_head: str = _HTML_DEFAULT_HEAD,
3150
+ html_head: str = "null", # should be deprecated ...
2812
3151
  included_content_layers: Optional[set[ContentLayer]] = None,
3152
+ split_page_view: bool = False,
2813
3153
  ) -> str:
2814
3154
  r"""Serialize to HTML."""
2815
- my_labels = labels if labels is not None else DEFAULT_EXPORT_LABELS
3155
+ from docling_core.experimental.serializer.html import (
3156
+ HTMLDocSerializer,
3157
+ HTMLOutputStyle,
3158
+ HTMLParams,
3159
+ )
3160
+
3161
+ my_labels = labels if labels is not None else DOCUMENT_TOKENS_EXPORT_LABELS
2816
3162
  my_layers = (
2817
3163
  included_content_layers
2818
3164
  if included_content_layers is not None
2819
3165
  else DEFAULT_CONTENT_LAYERS
2820
3166
  )
2821
3167
 
2822
- def close_lists(
2823
- curr_level: int,
2824
- prev_level: int,
2825
- in_ordered_list: List[bool],
2826
- html_texts: list[str],
2827
- ):
2828
-
2829
- if len(in_ordered_list) == 0:
2830
- return (in_ordered_list, html_texts)
2831
-
2832
- while curr_level < prev_level and len(in_ordered_list) > 0:
2833
- if in_ordered_list[-1]:
2834
- html_texts.append("</ol>")
2835
- else:
2836
- html_texts.append("</ul>")
2837
-
2838
- prev_level -= 1
2839
- in_ordered_list.pop() # = in_ordered_list[:-1]
2840
-
2841
- return (in_ordered_list, html_texts)
2842
-
2843
- head_lines = [
2844
- "<!DOCTYPE html>",
2845
- f'<html lang="{html_lang}">',
2846
- html_head,
2847
- ]
2848
- html_texts: list[str] = []
2849
-
2850
- prev_level = 0 # Track the previous item's level
2851
-
2852
- in_ordered_list: List[bool] = [] # False
2853
-
2854
- def _prepare_tag_content(
2855
- text: str, do_escape_html=True, do_replace_newline=True
2856
- ) -> str:
2857
- if do_escape_html:
2858
- text = html.escape(text, quote=False)
2859
- if do_replace_newline:
2860
- text = text.replace("\n", "<br>")
2861
- return text
2862
-
2863
- for ix, (item, curr_level) in enumerate(
2864
- self.iterate_items(
2865
- self.body,
2866
- with_groups=True,
2867
- page_no=page_no,
2868
- included_content_layers=my_layers,
2869
- )
2870
- ):
2871
- # If we've moved to a lower level, we're exiting one or more groups
2872
- if curr_level < prev_level and len(in_ordered_list) > 0:
2873
- # Calculate how many levels we've exited
2874
- # level_difference = previous_level - level
2875
- # Decrement list_nesting_level for each list group we've exited
2876
- # list_nesting_level = max(0, list_nesting_level - level_difference)
2877
-
2878
- in_ordered_list, html_texts = close_lists(
2879
- curr_level=curr_level,
2880
- prev_level=prev_level,
2881
- in_ordered_list=in_ordered_list,
2882
- html_texts=html_texts,
2883
- )
2884
-
2885
- prev_level = curr_level # Update previous_level for next iteration
3168
+ output_style = HTMLOutputStyle.SINGLE_COLUMN
3169
+ if split_page_view:
3170
+ output_style = HTMLOutputStyle.SPLIT_PAGE
2886
3171
 
2887
- if ix < from_element or to_element <= ix:
2888
- continue # skip as many items as you want
2889
-
2890
- if (isinstance(item, DocItem)) and (item.label not in my_labels):
2891
- continue # skip any label that is not whitelisted
2892
-
2893
- if isinstance(item, GroupItem) and item.label in [
2894
- GroupLabel.ORDERED_LIST,
2895
- ]:
2896
-
2897
- text = "<ol>"
2898
- html_texts.append(text)
2899
-
2900
- # Increment list nesting level when entering a new list
2901
- in_ordered_list.append(True)
2902
-
2903
- elif isinstance(item, GroupItem) and item.label in [
2904
- GroupLabel.LIST,
2905
- ]:
2906
-
2907
- text = "<ul>"
2908
- html_texts.append(text)
2909
-
2910
- # Increment list nesting level when entering a new list
2911
- in_ordered_list.append(False)
2912
-
2913
- elif isinstance(item, GroupItem):
2914
- continue
2915
-
2916
- elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
2917
- text_inner = _prepare_tag_content(item.text)
2918
- text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)
2919
-
2920
- html_texts.append(text)
2921
-
2922
- elif isinstance(item, SectionHeaderItem):
2923
-
2924
- section_level: int = min(item.level + 1, 6)
2925
-
2926
- text = get_html_tag_with_text_direction(
2927
- html_tag=f"h{section_level}",
2928
- text=_prepare_tag_content(item.text),
2929
- )
2930
- html_texts.append(text)
2931
-
2932
- elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
2933
-
2934
- math_formula = _prepare_tag_content(
2935
- item.text, do_escape_html=False, do_replace_newline=False
2936
- )
2937
- text = ""
2938
-
2939
- def _image_fallback(item: TextItem):
2940
- item_image = item.get_image(doc=self)
2941
- if item_image is not None:
2942
- img_ref = ImageRef.from_pil(item_image, dpi=72)
2943
- return (
2944
- "<figure>"
2945
- f'<img src="{img_ref.uri}" alt="{item.orig}" />'
2946
- "</figure>"
2947
- )
2948
-
2949
- img_fallback = _image_fallback(item)
2950
-
2951
- # If the formula is not processed correcty, use its image
2952
- if (
2953
- item.text == ""
2954
- and item.orig != ""
2955
- and image_mode == ImageRefMode.EMBEDDED
2956
- and len(item.prov) > 0
2957
- and img_fallback is not None
2958
- ):
2959
- text = img_fallback
2960
-
2961
- # Building a math equation in MathML format
2962
- # ref https://www.w3.org/TR/wai-aria-1.1/#math
2963
- elif formula_to_mathml and len(math_formula) > 0:
2964
- try:
2965
- mathml_element = latex2mathml.converter.convert_to_element(
2966
- math_formula, display="block"
2967
- )
2968
- annotation = SubElement(
2969
- mathml_element, "annotation", dict(encoding="TeX")
2970
- )
2971
- annotation.text = math_formula
2972
- mathml = unescape(tostring(mathml_element, encoding="unicode"))
2973
- text = f"<div>{mathml}</div>"
2974
- except Exception as err:
2975
- _logger.warning(
2976
- "Malformed formula cannot be rendered. "
2977
- f"Error {err.__class__.__name__}, formula={math_formula}"
2978
- )
2979
- if (
2980
- image_mode == ImageRefMode.EMBEDDED
2981
- and len(item.prov) > 0
2982
- and img_fallback is not None
2983
- ):
2984
- text = img_fallback
2985
- else:
2986
- text = f"<pre>{math_formula}</pre>"
2987
-
2988
- elif math_formula != "":
2989
- text = f"<pre>{math_formula}</pre>"
2990
-
2991
- if text != "":
2992
- html_texts.append(text)
2993
- else:
2994
- html_texts.append(
2995
- '<div class="formula-not-decoded">Formula not decoded</div>'
2996
- )
2997
-
2998
- elif isinstance(item, ListItem):
2999
- text = get_html_tag_with_text_direction(
3000
- html_tag="li", text=_prepare_tag_content(item.text)
3001
- )
3002
- html_texts.append(text)
3003
-
3004
- elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
3005
- text = get_html_tag_with_text_direction(
3006
- html_tag="li", text=_prepare_tag_content(item.text)
3007
- )
3008
- html_texts.append(text)
3009
-
3010
- elif isinstance(item, CodeItem):
3011
- code_text = _prepare_tag_content(
3012
- item.text, do_escape_html=False, do_replace_newline=False
3013
- )
3014
- text = f"<pre><code>{code_text}</code></pre>"
3015
- html_texts.append(text)
3016
-
3017
- elif isinstance(item, TextItem):
3018
-
3019
- text = get_html_tag_with_text_direction(
3020
- html_tag="p", text=_prepare_tag_content(item.text)
3021
- )
3022
- html_texts.append(text)
3023
-
3024
- elif isinstance(item, TableItem):
3025
-
3026
- text = item.export_to_html(doc=self, add_caption=True)
3027
- html_texts.append(text)
3028
-
3029
- elif isinstance(item, PictureItem):
3030
-
3031
- html_texts.append(
3032
- item.export_to_html(
3033
- doc=self, add_caption=True, image_mode=image_mode
3034
- )
3035
- )
3036
-
3037
- elif isinstance(item, DocItem) and item.label in my_labels:
3038
- continue
3039
-
3040
- html_texts.append("</html>")
3172
+ params = HTMLParams(
3173
+ labels=my_labels,
3174
+ layers=my_layers,
3175
+ pages={page_no} if page_no is not None else None,
3176
+ start_idx=from_element,
3177
+ stop_idx=to_element,
3178
+ image_mode=image_mode,
3179
+ formula_to_mathml=formula_to_mathml,
3180
+ html_head=html_head,
3181
+ html_lang=html_lang,
3182
+ output_style=output_style,
3183
+ )
3041
3184
 
3042
- lines = []
3043
- lines.extend(head_lines)
3044
- lines.extend(html_texts)
3185
+ if html_head == "null":
3186
+ params.html_head = None
3045
3187
 
3046
- delim = "\n"
3047
- html_text = (delim.join(lines)).strip()
3188
+ serializer = HTMLDocSerializer(
3189
+ doc=self,
3190
+ params=params,
3191
+ )
3192
+ ser_res = serializer.serialize()
3048
3193
 
3049
- return html_text
3194
+ return ser_res.text
3050
3195
 
3051
3196
  def load_from_doctags( # noqa: C901
3052
3197
  self,
@@ -3077,6 +3222,8 @@ class DoclingDocument(BaseModel):
3077
3222
  def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
3078
3223
  """Extract <loc_...> coords from the chunk, normalized by / 500."""
3079
3224
  coords = re.findall(r"<loc_(\d+)>", text_chunk)
3225
+ if len(coords) > 4:
3226
+ coords = coords[:4]
3080
3227
  if len(coords) == 4:
3081
3228
  l, t, r, b = map(float, coords)
3082
3229
  return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
@@ -3107,11 +3254,28 @@ class DoclingDocument(BaseModel):
3107
3254
 
3108
3255
  def otsl_parse_texts(texts, tokens):
3109
3256
  split_word = TableToken.OTSL_NL.value
3257
+ # CLEAN tokens from extra tags, only structural OTSL allowed
3258
+ clean_tokens = []
3259
+ for t in tokens:
3260
+ if t in [
3261
+ TableToken.OTSL_ECEL.value,
3262
+ TableToken.OTSL_FCEL.value,
3263
+ TableToken.OTSL_LCEL.value,
3264
+ TableToken.OTSL_UCEL.value,
3265
+ TableToken.OTSL_XCEL.value,
3266
+ TableToken.OTSL_NL.value,
3267
+ TableToken.OTSL_CHED.value,
3268
+ TableToken.OTSL_RHED.value,
3269
+ TableToken.OTSL_SROW.value,
3270
+ ]:
3271
+ clean_tokens.append(t)
3272
+ tokens = clean_tokens
3110
3273
  split_row_tokens = [
3111
3274
  list(y)
3112
3275
  for x, y in itertools.groupby(tokens, lambda z: z == split_word)
3113
3276
  if not x
3114
3277
  ]
3278
+
3115
3279
  table_cells = []
3116
3280
  r_idx = 0
3117
3281
  c_idx = 0
@@ -3263,6 +3427,40 @@ class DoclingDocument(BaseModel):
3263
3427
  table_cells=table_cells,
3264
3428
  )
3265
3429
 
3430
def extract_chart_type(text_chunk: str):
    """Return the chart classification label tagged inside ``text_chunk``.

    The chunk is searched for a ``<label>`` tag, trying the canonical
    ``PictureClassificationLabel`` chart members first and the alternative
    spellings afterwards. The first tag found decides the result; ``None``
    is returned when no known chart tag is present.
    """
    # Current SmolDocling can predict different labels; each of these
    # alternative spellings is translated to its canonical chart label.
    alias_targets = {
        "line": PictureClassificationLabel.LINE_CHART,
        "dot_line": PictureClassificationLabel.LINE_CHART,
        "vbar_categorical": PictureClassificationLabel.BAR_CHART,
        "hbar_categorical": PictureClassificationLabel.BAR_CHART,
    }
    canonical_labels = (
        PictureClassificationLabel.PIE_CHART,
        PictureClassificationLabel.BAR_CHART,
        PictureClassificationLabel.STACKED_BAR_CHART,
        PictureClassificationLabel.LINE_CHART,
        PictureClassificationLabel.FLOW_CHART,
        PictureClassificationLabel.SCATTER_CHART,
        PictureClassificationLabel.HEATMAP,
    )

    # Canonical labels are probed before the aliases; dict iteration keeps
    # the aliases in their declared order.
    for candidate in (*canonical_labels, *alias_targets):
        if f"<{candidate}>" not in text_chunk:
            continue
        # Aliases resolve through the table, canonical labels map to themselves.
        return PictureClassificationLabel(alias_targets.get(candidate, candidate))

    return None
3463
+
3266
3464
  def parse_key_value_item(
3267
3465
  tokens: str, image: Optional[PILImage.Image] = None
3268
3466
  ) -> Tuple[GraphData, Optional[ProvenanceItem]]:
@@ -3394,10 +3592,9 @@ class DoclingDocument(BaseModel):
3394
3592
  rf"{DocumentToken.ORDERED_LIST.value}|"
3395
3593
  rf"{DocumentToken.UNORDERED_LIST.value}|"
3396
3594
  rf"{DocItemLabel.KEY_VALUE_REGION}|"
3595
+ rf"{DocumentToken.CHART.value}|"
3397
3596
  rf"{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
3398
3597
  )
3399
-
3400
- # DocumentToken.OTSL
3401
3598
  pattern = re.compile(tag_pattern, re.DOTALL)
3402
3599
 
3403
3600
  # Go through each match in order
@@ -3405,18 +3602,17 @@ class DoclingDocument(BaseModel):
3405
3602
  full_chunk = match.group(0)
3406
3603
  tag_name = match.group("tag")
3407
3604
 
3408
- bbox = extract_bounding_box(full_chunk) if image else None
3605
+ bbox = extract_bounding_box(full_chunk) # Extracts first bbox
3409
3606
  doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
3410
3607
 
3411
3608
  if tag_name == DocumentToken.OTSL.value:
3412
3609
  table_data = parse_table_content(full_chunk)
3413
- bbox = extract_bounding_box(full_chunk) if image else None
3414
3610
  caption, caption_bbox = extract_caption(full_chunk)
3415
3611
  if caption is not None and caption_bbox is not None:
3416
3612
  caption.prov.append(
3417
3613
  ProvenanceItem(
3418
3614
  bbox=caption_bbox.resize_by_scale(pg_width, pg_height),
3419
- charspan=(0, 0),
3615
+ charspan=(0, len(caption.text)),
3420
3616
  page_no=page_no,
3421
3617
  )
3422
3618
  )
@@ -3430,8 +3626,13 @@ class DoclingDocument(BaseModel):
3430
3626
  else:
3431
3627
  self.add_table(data=table_data, caption=caption)
3432
3628
 
3433
- elif tag_name == DocItemLabel.PICTURE:
3434
- text_caption_content = extract_inner_text(full_chunk)
3629
+ elif tag_name in [DocItemLabel.PICTURE, DocItemLabel.CHART]:
3630
+ caption, caption_bbox = extract_caption(full_chunk)
3631
+ table_data = None
3632
+ chart_type = None
3633
+ if tag_name == DocumentToken.CHART.value:
3634
+ table_data = parse_table_content(full_chunk)
3635
+ chart_type = extract_chart_type(full_chunk)
3435
3636
  if image:
3436
3637
  if bbox:
3437
3638
  im_width, im_height = image.size
@@ -3455,30 +3656,77 @@ class DoclingDocument(BaseModel):
3455
3656
  ),
3456
3657
  )
3457
3658
  # If there is a caption to an image, add it as well
3458
- if len(text_caption_content) > 0:
3459
- caption_item = self.add_text(
3460
- label=DocItemLabel.CAPTION,
3461
- text=text_caption_content,
3462
- parent=None,
3659
+ if caption is not None and caption_bbox is not None:
3660
+ caption.prov.append(
3661
+ ProvenanceItem(
3662
+ bbox=caption_bbox.resize_by_scale(
3663
+ pg_width, pg_height
3664
+ ),
3665
+ charspan=(0, len(caption.text)),
3666
+ page_no=page_no,
3667
+ )
3463
3668
  )
3464
- pic.captions.append(caption_item.get_ref())
3669
+ pic.captions.append(caption.get_ref())
3670
+ pic_title = "picture"
3671
+ if chart_type is not None:
3672
+ pic.annotations.append(
3673
+ PictureClassificationData(
3674
+ provenance="load_from_doctags",
3675
+ predicted_classes=[
3676
+ # chart_type
3677
+ PictureClassificationClass(
3678
+ class_name=chart_type, confidence=1.0
3679
+ )
3680
+ ],
3681
+ )
3682
+ )
3683
+ pic_title = chart_type
3684
+ if table_data is not None:
3685
+ # Add chart data as PictureTabularChartData
3686
+ pd = PictureTabularChartData(
3687
+ chart_data=table_data, title=pic_title
3688
+ )
3689
+ pic.annotations.append(pd)
3465
3690
  else:
3466
3691
  if bbox:
3467
3692
  # In case we don't have access to an binary of an image
3468
- self.add_picture(
3693
+ pic = self.add_picture(
3469
3694
  parent=None,
3470
3695
  prov=ProvenanceItem(
3471
3696
  bbox=bbox, charspan=(0, 0), page_no=page_no
3472
3697
  ),
3473
3698
  )
3474
3699
  # If there is a caption to an image, add it as well
3475
- if len(text_caption_content) > 0:
3476
- caption_item = self.add_text(
3477
- label=DocItemLabel.CAPTION,
3478
- text=text_caption_content,
3479
- parent=None,
3700
+ if caption is not None and caption_bbox is not None:
3701
+ caption.prov.append(
3702
+ ProvenanceItem(
3703
+ bbox=caption_bbox.resize_by_scale(
3704
+ pg_width, pg_height
3705
+ ),
3706
+ charspan=(0, len(caption.text)),
3707
+ page_no=page_no,
3708
+ )
3709
+ )
3710
+ pic.captions.append(caption.get_ref())
3711
+ if chart_type is not None:
3712
+ pic.annotations.append(
3713
+ PictureClassificationData(
3714
+ provenance="load_from_doctags",
3715
+ predicted_classes=[
3716
+ # chart_type
3717
+ PictureClassificationClass(
3718
+ class_name=chart_type, confidence=1.0
3719
+ )
3720
+ ],
3721
+ )
3480
3722
  )
3481
- pic.captions.append(caption_item.get_ref())
3723
+ if table_data is not None:
3724
+ # Add chart data as PictureTabularChartData
3725
+ pd = PictureTabularChartData(
3726
+ chart_data=table_data, title=pic_title
3727
+ )
3728
+ pic.annotations.append(pd)
3729
+
3482
3730
  elif tag_name == DocItemLabel.KEY_VALUE_REGION:
3483
3731
  key_value_data, kv_item_prov = parse_key_value_item(
3484
3732
  full_chunk, image