docling-core 2.25.0__py3-none-any.whl → 2.26.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -3,7 +3,6 @@
3
3
  import base64
4
4
  import copy
5
5
  import hashlib
6
- import html
7
6
  import itertools
8
7
  import json
9
8
  import logging
@@ -12,17 +11,12 @@ import os
12
11
  import re
13
12
  import sys
14
13
  import typing
15
- import warnings
16
14
  from enum import Enum
17
15
  from io import BytesIO
18
16
  from pathlib import Path
19
17
  from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
20
- from urllib.parse import quote, unquote
21
- from xml.etree.cElementTree import SubElement, tostring
22
- from xml.sax.saxutils import unescape
18
+ from urllib.parse import unquote
23
19
 
24
- import latex2mathml.converter
25
- import latex2mathml.exceptions
26
20
  import pandas as pd
27
21
  import yaml
28
22
  from PIL import Image as PILImage
@@ -49,13 +43,10 @@ from docling_core.types.doc.labels import (
49
43
  GraphCellLabel,
50
44
  GraphLinkLabel,
51
45
  GroupLabel,
46
+ PictureClassificationLabel,
52
47
  )
53
48
  from docling_core.types.doc.tokens import _LOC_PREFIX, DocumentToken, TableToken
54
- from docling_core.types.doc.utils import (
55
- get_html_tag_with_text_direction,
56
- get_text_direction,
57
- relative_path,
58
- )
49
+ from docling_core.types.doc.utils import relative_path
59
50
 
60
51
  _logger = logging.getLogger(__name__)
61
52
 
@@ -290,22 +281,6 @@ class PictureScatterChartData(PictureChartData):
290
281
  points: List[ChartPoint]
291
282
 
292
283
 
293
- PictureDataType = Annotated[
294
- Union[
295
- PictureClassificationData,
296
- PictureDescriptionData,
297
- PictureMoleculeData,
298
- PictureMiscData,
299
- PictureLineChartData,
300
- PictureBarChartData,
301
- PictureStackedBarChartData,
302
- PicturePieChartData,
303
- PictureScatterChartData,
304
- ],
305
- Field(discriminator="kind"),
306
- ]
307
-
308
-
309
284
  class TableCell(BaseModel):
310
285
  """TableCell."""
311
286
 
@@ -391,6 +366,35 @@ class TableData(BaseModel): # TBD
391
366
  return table_data
392
367
 
393
368
 
369
+ class PictureTabularChartData(PictureChartData):
370
+ """Base class for picture chart data.
371
+
372
+ Attributes:
373
+ title (str): The title of the chart.
374
+ chart_data (TableData): Chart data in the table format.
375
+ """
376
+
377
+ kind: Literal["tabular_chart_data"] = "tabular_chart_data"
378
+ chart_data: TableData
379
+
380
+
381
+ PictureDataType = Annotated[
382
+ Union[
383
+ PictureClassificationData,
384
+ PictureDescriptionData,
385
+ PictureMoleculeData,
386
+ PictureMiscData,
387
+ PictureTabularChartData,
388
+ PictureLineChartData,
389
+ PictureBarChartData,
390
+ PictureStackedBarChartData,
391
+ PicturePieChartData,
392
+ PictureScatterChartData,
393
+ ],
394
+ Field(discriminator="kind"),
395
+ ]
396
+
397
+
394
398
  class DocumentOrigin(BaseModel):
395
399
  """FileSource."""
396
400
 
@@ -458,8 +462,12 @@ class RefItem(BaseModel):
458
462
  populate_by_name=True,
459
463
  )
460
464
 
465
+ def _split_ref_to_path(self):
466
+ """Get the path of the reference."""
467
+ return self.cref.split("/")
468
+
461
469
  def resolve(self, doc: "DoclingDocument"):
462
- """resolve."""
470
+ """Resolve the path in the document."""
463
471
  path_components = self.cref.split("/")
464
472
  if (num_comps := len(path_components)) == 3:
465
473
  _, path, index_str = path_components
@@ -624,10 +632,98 @@ class NodeItem(BaseModel):
624
632
 
625
633
  model_config = ConfigDict(extra="forbid")
626
634
 
627
- def get_ref(self):
635
+ def get_ref(self) -> RefItem:
628
636
  """get_ref."""
629
637
  return RefItem(cref=self.self_ref)
630
638
 
639
+ def _get_parent_ref(
640
+ self, doc: "DoclingDocument", stack: list[int]
641
+ ) -> Optional[RefItem]:
642
+ """get_parent_ref."""
643
+ if len(stack) == 0:
644
+ return self.parent
645
+ elif len(stack) > 0 and stack[0] < len(self.children):
646
+ item = self.children[stack[0]].resolve(doc)
647
+ return item._get_parent_ref(doc=doc, stack=stack[1:])
648
+
649
+ return None
650
+
651
+ def _delete_child(self, doc: "DoclingDocument", stack: list[int]) -> bool:
652
+ """Delete child node in tree."""
653
+ if len(stack) == 1 and stack[0] < len(self.children):
654
+ del self.children[stack[0]]
655
+ return True
656
+ elif len(stack) > 1 and stack[0] < len(self.children):
657
+ item = self.children[stack[0]].resolve(doc)
658
+ return item._delete_child(doc=doc, stack=stack[1:])
659
+
660
+ return False
661
+
662
+ def _update_child(
663
+ self, doc: "DoclingDocument", stack: list[int], new_ref: RefItem
664
+ ) -> bool:
665
+ """Update child node in tree."""
666
+ if len(stack) == 1 and stack[0] < len(self.children):
667
+ # ensure the parent is correct
668
+ new_item = new_ref.resolve(doc=doc)
669
+ new_item.parent = self.get_ref()
670
+
671
+ self.children[stack[0]] = new_ref
672
+ return True
673
+ elif len(stack) > 1 and stack[0] < len(self.children):
674
+ item = self.children[stack[0]].resolve(doc)
675
+ return item._update_child(doc=doc, stack=stack[1:], new_ref=new_ref)
676
+
677
+ return False
678
+
679
+ def _add_child(
680
+ self, doc: "DoclingDocument", stack: list[int], new_ref: RefItem
681
+ ) -> bool:
682
+ """Append child to node identified by stack."""
683
+ if len(stack) == 0:
684
+
685
+ # ensure the parent is correct
686
+ new_item = new_ref.resolve(doc=doc)
687
+ new_item.parent = self.get_ref()
688
+
689
+ self.children.append(new_ref)
690
+ return True
691
+ elif len(stack) > 0 and stack[0] < len(self.children):
692
+ item = self.children[stack[0]].resolve(doc)
693
+ return item._add_child(doc=doc, stack=stack[1:], new_ref=new_ref)
694
+
695
+ return False
696
+
697
+ def _add_sibling(
698
+ self,
699
+ doc: "DoclingDocument",
700
+ stack: list[int],
701
+ new_ref: RefItem,
702
+ after: bool = True,
703
+ ) -> bool:
704
+ """Add sibling node in tree."""
705
+ if len(stack) == 1 and stack[0] < len(self.children) and (not after):
706
+ # ensure the parent is correct
707
+ new_item = new_ref.resolve(doc=doc)
708
+ new_item.parent = self.get_ref()
709
+
710
+ self.children.insert(stack[0], new_ref)
711
+ return True
712
+ elif len(stack) == 1 and stack[0] < len(self.children) and (after):
713
+ # ensure the parent is correct
714
+ new_item = new_ref.resolve(doc=doc)
715
+ new_item.parent = self.get_ref()
716
+
717
+ self.children.insert(stack[0] + 1, new_ref)
718
+ return True
719
+ elif len(stack) > 1 and stack[0] < len(self.children):
720
+ item = self.children[stack[0]].resolve(doc)
721
+ return item._add_sibling(
722
+ doc=doc, stack=stack[1:], new_ref=new_ref, after=after
723
+ )
724
+
725
+ return False
726
+
631
727
 
632
728
  class GroupItem(NodeItem): # Container type, can't be a leaf node
633
729
  """GroupItem."""
@@ -953,7 +1049,9 @@ class FormulaItem(TextItem):
953
1049
  class PictureItem(FloatingItem):
954
1050
  """PictureItem."""
955
1051
 
956
- label: typing.Literal[DocItemLabel.PICTURE] = DocItemLabel.PICTURE
1052
+ label: typing.Literal[DocItemLabel.PICTURE, DocItemLabel.CHART] = (
1053
+ DocItemLabel.PICTURE
1054
+ )
957
1055
 
958
1056
  annotations: List[PictureDataType] = []
959
1057
 
@@ -1020,54 +1118,19 @@ class PictureItem(FloatingItem):
1020
1118
  image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
1021
1119
  ) -> str:
1022
1120
  """Export picture to HTML format."""
1023
- text = ""
1024
- if add_caption and len(self.captions):
1025
- text = self.caption_text(doc)
1026
-
1027
- caption_text = ""
1028
- if len(text) > 0:
1029
- caption_text = get_html_tag_with_text_direction(
1030
- html_tag="figcaption", text=text
1031
- )
1032
-
1033
- default_response = f"<figure>{caption_text}</figure>"
1034
-
1035
- if image_mode == ImageRefMode.PLACEHOLDER:
1036
- return default_response
1037
-
1038
- elif image_mode == ImageRefMode.EMBEDDED:
1039
- # short-cut: we already have the image in base64
1040
- if (
1041
- isinstance(self.image, ImageRef)
1042
- and isinstance(self.image.uri, AnyUrl)
1043
- and self.image.uri.scheme == "data"
1044
- ):
1045
- img_text = f'<img src="{self.image.uri}">'
1046
- return f"<figure>{caption_text}{img_text}</figure>"
1047
-
1048
- # get the self.image._pil or crop it out of the page-image
1049
- img = self.get_image(doc)
1050
-
1051
- if img is not None:
1052
- imgb64 = self._image_to_base64(img)
1053
- img_text = f'<img src="data:image/png;base64,{imgb64}">'
1054
-
1055
- return f"<figure>{caption_text}{img_text}</figure>"
1056
- else:
1057
- return default_response
1058
-
1059
- elif image_mode == ImageRefMode.REFERENCED:
1060
-
1061
- if not isinstance(self.image, ImageRef) or (
1062
- isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "data"
1063
- ):
1064
- return default_response
1065
-
1066
- img_text = f'<img src="{quote(str(self.image.uri))}">'
1067
- return f"<figure>{caption_text}{img_text}</figure>"
1121
+ from docling_core.experimental.serializer.html import (
1122
+ HTMLDocSerializer,
1123
+ HTMLParams,
1124
+ )
1068
1125
 
1069
- else:
1070
- return default_response
1126
+ serializer = HTMLDocSerializer(
1127
+ doc=doc,
1128
+ params=HTMLParams(
1129
+ image_mode=image_mode,
1130
+ ),
1131
+ )
1132
+ text = serializer.serialize(item=self).text
1133
+ return text
1071
1134
 
1072
1135
  @deprecated("Use export_to_doctags() instead.")
1073
1136
  def export_to_document_tokens(self, *args, **kwargs):
@@ -1218,81 +1281,18 @@ class TableItem(FloatingItem):
1218
1281
  add_caption: bool = True,
1219
1282
  ) -> str:
1220
1283
  """Export the table as html."""
1221
- if doc is None:
1222
- warnings.warn(
1223
- "The `doc` argument will be mandatory in a future version. "
1224
- "It must be provided to include a caption.",
1225
- DeprecationWarning,
1226
- )
1227
-
1228
- nrows = self.data.num_rows
1229
- ncols = self.data.num_cols
1230
-
1231
- text = ""
1232
- if doc is not None and add_caption and len(self.captions):
1233
- text = html.escape(self.caption_text(doc))
1234
-
1235
- if len(self.data.table_cells) == 0:
1236
- return ""
1237
-
1238
- body = ""
1239
-
1240
- for i in range(nrows):
1241
- body += "<tr>"
1242
- for j in range(ncols):
1243
- cell: TableCell = self.data.grid[i][j]
1244
-
1245
- rowspan, rowstart = (
1246
- cell.row_span,
1247
- cell.start_row_offset_idx,
1248
- )
1249
- colspan, colstart = (
1250
- cell.col_span,
1251
- cell.start_col_offset_idx,
1252
- )
1253
-
1254
- if rowstart != i:
1255
- continue
1256
- if colstart != j:
1257
- continue
1258
-
1259
- content = html.escape(cell.text.strip())
1260
- celltag = "td"
1261
- if cell.column_header:
1262
- celltag = "th"
1263
-
1264
- opening_tag = f"{celltag}"
1265
- if rowspan > 1:
1266
- opening_tag += f' rowspan="{rowspan}"'
1267
- if colspan > 1:
1268
- opening_tag += f' colspan="{colspan}"'
1269
-
1270
- text_dir = get_text_direction(content)
1271
- if text_dir == "rtl":
1272
- opening_tag += f' dir="{dir}"'
1273
-
1274
- body += f"<{opening_tag}>{content}</{celltag}>"
1275
- body += "</tr>"
1276
-
1277
- # dir = get_text_direction(text)
1278
-
1279
- if len(text) > 0 and len(body) > 0:
1280
- caption_text = get_html_tag_with_text_direction(
1281
- html_tag="caption", text=text
1282
- )
1283
- body = f"<table>{caption_text}<tbody>{body}</tbody></table>"
1284
+ if doc is not None:
1285
+ from docling_core.experimental.serializer.html import HTMLDocSerializer
1284
1286
 
1285
- elif len(text) == 0 and len(body) > 0:
1286
- body = f"<table><tbody>{body}</tbody></table>"
1287
- elif len(text) > 0 and len(body) == 0:
1288
- caption_text = get_html_tag_with_text_direction(
1289
- html_tag="caption", text=text
1290
- )
1291
- body = f"<table>{caption_text}</table>"
1287
+ serializer = HTMLDocSerializer(doc=doc)
1288
+ text = serializer.serialize(item=self).text
1289
+ return text
1292
1290
  else:
1293
- body = "<table></table>"
1294
-
1295
- return body
1291
+ _logger.error(
1292
+ "Usage of TableItem.export_to_html() without `doc` argument is "
1293
+ "deprecated.",
1294
+ )
1295
+ return ""
1296
1296
 
1297
1297
  def export_to_otsl(
1298
1298
  self,
@@ -1567,76 +1567,6 @@ class PageItem(BaseModel):
1567
1567
  class DoclingDocument(BaseModel):
1568
1568
  """DoclingDocument."""
1569
1569
 
1570
- _HTML_DEFAULT_HEAD: str = r"""<head>
1571
- <link rel="icon" type="image/png"
1572
- href="https://raw.githubusercontent.com/docling-project/docling/refs/heads/main/docs/assets/logo.svg"/>
1573
- <meta charset="UTF-8">
1574
- <title>
1575
- Powered by Docling
1576
- </title>
1577
- <style>
1578
- html {
1579
- background-color: LightGray;
1580
- }
1581
- body {
1582
- margin: 0 auto;
1583
- width:800px;
1584
- padding: 30px;
1585
- background-color: White;
1586
- font-family: Arial, sans-serif;
1587
- box-shadow: 10px 10px 10px grey;
1588
- }
1589
- figure{
1590
- display: block;
1591
- width: 100%;
1592
- margin: 0px;
1593
- margin-top: 10px;
1594
- margin-bottom: 10px;
1595
- }
1596
- img {
1597
- display: block;
1598
- margin: auto;
1599
- margin-top: 10px;
1600
- margin-bottom: 10px;
1601
- max-width: 640px;
1602
- max-height: 640px;
1603
- }
1604
- table {
1605
- min-width:500px;
1606
- background-color: White;
1607
- border-collapse: collapse;
1608
- cell-padding: 5px;
1609
- margin: auto;
1610
- margin-top: 10px;
1611
- margin-bottom: 10px;
1612
- }
1613
- th, td {
1614
- border: 1px solid black;
1615
- padding: 8px;
1616
- }
1617
- th {
1618
- font-weight: bold;
1619
- }
1620
- table tr:nth-child(even) td{
1621
- background-color: LightGray;
1622
- }
1623
- math annotation {
1624
- display: none;
1625
- }
1626
- .formula-not-decoded {
1627
- background: repeating-linear-gradient(
1628
- 45deg, /* Angle of the stripes */
1629
- LightGray, /* First color */
1630
- LightGray 10px, /* Length of the first color */
1631
- White 10px, /* Second color */
1632
- White 20px /* Length of the second color */
1633
- );
1634
- margin: 0;
1635
- text-align: center;
1636
- }
1637
- </style>
1638
- </head>"""
1639
-
1640
1570
  schema_name: typing.Literal["DoclingDocument"] = "DoclingDocument"
1641
1571
  version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
1642
1572
  CURRENT_VERSION
@@ -1683,6 +1613,364 @@ class DoclingDocument(BaseModel):
1683
1613
  item["content_layer"] = "furniture"
1684
1614
  return data
1685
1615
 
1616
+ # ---------------------------
1617
+ # Public Manipulation methods
1618
+ # ---------------------------
1619
+
1620
+ def append_child_item(
1621
+ self, *, child: NodeItem, parent: Optional[NodeItem] = None
1622
+ ) -> None:
1623
+ """Adds an item."""
1624
+ if len(child.children) > 0:
1625
+ raise ValueError("Can not append a child with children")
1626
+
1627
+ parent = parent if parent is not None else self.body
1628
+
1629
+ success, stack = self._get_stack_of_item(item=parent)
1630
+
1631
+ if not success:
1632
+ raise ValueError(
1633
+ f"Could not resolve the parent node in the document tree: {parent}"
1634
+ )
1635
+
1636
+ # Append the item to the attributes of the doc
1637
+ self._append_item(item=child, parent_ref=parent.get_ref())
1638
+
1639
+ # Update the tree of the doc
1640
+ success = self.body._add_child(doc=self, new_ref=child.get_ref(), stack=stack)
1641
+
1642
+ # Clean the attribute (orphan) if not successful
1643
+ if not success:
1644
+ self._pop_item(item=child)
1645
+ raise ValueError(f"Could not append child: {child} to parent: {parent}")
1646
+
1647
+ def insert_item_after_sibling(
1648
+ self, *, new_item: NodeItem, sibling: NodeItem
1649
+ ) -> None:
1650
+ """Inserts an item, given its node_item instance, after other as a sibling."""
1651
+ self._insert_item_at_refitem(item=new_item, ref=sibling.get_ref(), after=True)
1652
+
1653
+ def insert_item_before_sibling(
1654
+ self, *, new_item: NodeItem, sibling: NodeItem
1655
+ ) -> None:
1656
+ """Inserts an item, given its node_item instance, before other as a sibling."""
1657
+ self._insert_item_at_refitem(item=new_item, ref=sibling.get_ref(), after=False)
1658
+
1659
+ def delete_items(self, *, node_items: List[NodeItem]) -> None:
1660
+ """Deletes an item, given its instance or ref, and any children it has."""
1661
+ refs = []
1662
+ for _ in node_items:
1663
+ refs.append(_.get_ref())
1664
+
1665
+ self._delete_items(refs=refs)
1666
+
1667
+ def replace_item(self, *, new_item: NodeItem, old_item: NodeItem) -> None:
1668
+ """Replace item with new item."""
1669
+ self.insert_item_after_sibling(new_item=new_item, sibling=old_item)
1670
+ self.delete_items(node_items=[old_item])
1671
+
1672
+ # ----------------------------
1673
+ # Private Manipulation methods
1674
+ # ----------------------------
1675
+
1676
+ def _get_stack_of_item(self, item: NodeItem) -> tuple[bool, list[int]]:
1677
+ """Find the stack indices of the item."""
1678
+ return self._get_stack_of_refitem(ref=item.get_ref())
1679
+
1680
+ def _get_stack_of_refitem(self, ref: RefItem) -> tuple[bool, list[int]]:
1681
+ """Find the stack indices of the reference."""
1682
+ if ref == self.body.get_ref():
1683
+ return (True, [])
1684
+
1685
+ node = ref.resolve(doc=self)
1686
+ parent_ref = node._get_parent_ref(doc=self, stack=[])
1687
+
1688
+ if parent_ref is None:
1689
+ return (False, [])
1690
+
1691
+ stack: list[int] = []
1692
+ while parent_ref is not None:
1693
+ parent = parent_ref.resolve(doc=self)
1694
+
1695
+ index = parent.children.index(node.get_ref())
1696
+ stack.insert(0, index) # prepend the index
1697
+
1698
+ node = parent
1699
+ parent_ref = node._get_parent_ref(doc=self, stack=[])
1700
+
1701
+ return (True, stack)
1702
+
1703
+ def _insert_item_at_refitem(
1704
+ self, item: NodeItem, ref: RefItem, after: bool
1705
+ ) -> RefItem:
1706
+ """Insert node-item using the self-reference."""
1707
+ success, stack = self._get_stack_of_refitem(ref=ref)
1708
+
1709
+ if not success:
1710
+ raise ValueError(
1711
+ f"Could not insert at {ref.cref}: could not find the stack"
1712
+ )
1713
+
1714
+ return self._insert_item_at_stack(item=item, stack=stack, after=after)
1715
+
1716
+ def _append_item(self, *, item: NodeItem, parent_ref: RefItem) -> RefItem:
1717
+ """Append item of its type."""
1718
+ cref: str = "" # to be updated
1719
+
1720
+ if isinstance(item, TextItem):
1721
+ item_label = "texts"
1722
+ item_index = len(self.texts)
1723
+
1724
+ cref = f"#/{item_label}/{item_index}"
1725
+
1726
+ item.self_ref = cref
1727
+ item.parent = parent_ref
1728
+
1729
+ self.texts.append(item)
1730
+
1731
+ elif isinstance(item, TableItem):
1732
+ item_label = "tables"
1733
+ item_index = len(self.tables)
1734
+
1735
+ cref = f"#/{item_label}/{item_index}"
1736
+
1737
+ item.self_ref = cref
1738
+ item.parent = parent_ref
1739
+
1740
+ self.tables.append(item)
1741
+
1742
+ elif isinstance(item, PictureItem):
1743
+ item_label = "pictures"
1744
+ item_index = len(self.pictures)
1745
+
1746
+ cref = f"#/{item_label}/{item_index}"
1747
+
1748
+ item.self_ref = cref
1749
+ item.parent = parent_ref
1750
+
1751
+ self.pictures.append(item)
1752
+
1753
+ elif isinstance(item, KeyValueItem):
1754
+ item_label = "key_value_items"
1755
+ item_index = len(self.key_value_items)
1756
+
1757
+ cref = f"#/{item_label}/{item_index}"
1758
+
1759
+ item.self_ref = cref
1760
+ item.parent = parent_ref
1761
+
1762
+ self.key_value_items.append(item)
1763
+
1764
+ elif isinstance(item, FormItem):
1765
+ item_label = "form_items"
1766
+ item_index = len(self.form_items)
1767
+
1768
+ cref = f"#/{item_label}/{item_index}"
1769
+
1770
+ item.self_ref = cref
1771
+ item.parent = parent_ref
1772
+
1773
+ self.form_items.append(item)
1774
+ else:
1775
+ raise ValueError(f"Item {item} is not supported for insertion")
1776
+
1777
+ return RefItem(cref=cref)
1778
+
1779
+ def _pop_item(self, *, item: NodeItem):
1780
+ """Pop the last item of its type."""
1781
+ path = item.self_ref.split("/")
1782
+
1783
+ if len(path) != 3:
1784
+ raise ValueError(f"Can not pop item with path: {path}")
1785
+
1786
+ item_label = path[1]
1787
+ item_index = int(path[2])
1788
+
1789
+ if (
1790
+ len(self.__getattribute__(item_label)) + 1 == item_index
1791
+ ): # we can only pop the last item
1792
+ del self.__getattribute__(item_label)[item_index]
1793
+ else:
1794
+ msg = f"index:{item_index}, len:{len(self.__getattribute__(item_label))}"
1795
+ raise ValueError(f"Failed to pop: item is not last ({msg})")
1796
+
1797
+ def _insert_item_at_stack(
1798
+ self, item: NodeItem, stack: list[int], after: bool
1799
+ ) -> RefItem:
1800
+ """Insert node-item using the self-reference."""
1801
+ parent_ref = self.body._get_parent_ref(doc=self, stack=stack)
1802
+
1803
+ if parent_ref is None:
1804
+ raise ValueError(f"Could not find a parent at stack: {stack}")
1805
+
1806
+ new_ref = self._append_item(item=item, parent_ref=parent_ref)
1807
+
1808
+ success = self.body._add_sibling(
1809
+ doc=self, stack=stack, new_ref=new_ref, after=after
1810
+ )
1811
+
1812
+ if not success:
1813
+ self._pop_item(item=item)
1814
+
1815
+ return item.get_ref()
1816
+
1817
+ def _delete_items(self, refs: list[RefItem]) -> bool:
1818
+ """Delete document item using the self-reference."""
1819
+ to_be_deleted_items: dict[tuple[int, ...], str] = {} # stack to cref
1820
+
1821
+ # Identify the to_be_deleted_items
1822
+ for item, stack in self._iterate_items_with_stack(with_groups=True):
1823
+ ref = item.get_ref()
1824
+
1825
+ if ref in refs:
1826
+ to_be_deleted_items[tuple(stack)] = ref.cref
1827
+
1828
+ substacks = [stack[0 : i + 1] for i in range(len(stack) - 1)]
1829
+ for substack in substacks:
1830
+ if tuple(substack) in to_be_deleted_items:
1831
+ to_be_deleted_items[tuple(stack)] = ref.cref
1832
+
1833
+ if len(to_be_deleted_items) == 0:
1834
+ raise ValueError("Nothing to be deleted ...")
1835
+
1836
+ # Clean the tree, reverse the order to not have to update
1837
+ for stack_, ref_ in reversed(sorted(to_be_deleted_items.items())):
1838
+ success = self.body._delete_child(doc=self, stack=list(stack_))
1839
+
1840
+ if not success:
1841
+ del to_be_deleted_items[stack_]
1842
+ else:
1843
+ _logger.info(f"deleted item in tree at stack: {stack_} => {ref_}")
1844
+
1845
+ # Create a new lookup of the orphans:
1846
+ # dict of item_label (`texts`, `tables`, ...) to a
1847
+ # dict of item_label with delta (default = -1).
1848
+ lookup: dict[str, dict[int, int]] = {}
1849
+
1850
+ for stack_, ref_ in to_be_deleted_items.items():
1851
+ path = ref_.split("/")
1852
+ if len(path) == 3:
1853
+
1854
+ item_label = path[1]
1855
+ item_index = int(path[2])
1856
+
1857
+ if item_label not in lookup:
1858
+ lookup[item_label] = {}
1859
+
1860
+ lookup[item_label][item_index] = -1
1861
+
1862
+ # Remove the orphans in reverse order
1863
+ for item_label, item_inds in lookup.items():
1864
+ for item_index, val in reversed(
1865
+ sorted(item_inds.items())
1866
+ ): # make sure you delete the last in the list first!
1867
+ _logger.debug(f"deleting item in doc for {item_label} for {item_index}")
1868
+ del self.__getattribute__(item_label)[item_index]
1869
+
1870
+ self._update_breadth_first_with_lookup(
1871
+ node=self.body, refs_to_be_deleted=refs, lookup=lookup
1872
+ )
1873
+
1874
+ return True
1875
+
1876
+ # Update the references
1877
+ def _update_ref_with_lookup(
1878
+ self, item_label: str, item_index: int, lookup: dict[str, dict[int, int]]
1879
+ ) -> RefItem:
1880
+ """Update ref with lookup."""
1881
+ if item_label not in lookup: # Nothing to be done
1882
+ return RefItem(cref=f"#/{item_label}/{item_index}")
1883
+
1884
+ # Count how many items have been deleted in front of you
1885
+ delta = sum(
1886
+ val if item_index >= key else 0 for key, val in lookup[item_label].items()
1887
+ )
1888
+ new_index = item_index + delta
1889
+
1890
+ return RefItem(cref=f"#/{item_label}/{new_index}")
1891
+
1892
+ def _update_refitems_with_lookup(
1893
+ self,
1894
+ ref_items: list[RefItem],
1895
+ refs_to_be_deleted: list[RefItem],
1896
+ lookup: dict[str, dict[int, int]],
1897
+ ) -> list[RefItem]:
1898
+ """Update refitems with lookup."""
1899
+ new_refitems = []
1900
+ for ref_item in ref_items:
1901
+
1902
+ if (
1903
+ ref_item not in refs_to_be_deleted
1904
+ ): # if ref_item is in ref, then delete/skip them
1905
+ path = ref_item._split_ref_to_path()
1906
+ if len(path) == 3:
1907
+ new_refitems.append(
1908
+ self._update_ref_with_lookup(
1909
+ item_label=path[1],
1910
+ item_index=int(path[2]),
1911
+ lookup=lookup,
1912
+ )
1913
+ )
1914
+ else:
1915
+ new_refitems.append(ref_item)
1916
+
1917
+ return new_refitems
1918
+
1919
+ def _update_breadth_first_with_lookup(
1920
+ self,
1921
+ node: NodeItem,
1922
+ refs_to_be_deleted: list[RefItem],
1923
+ lookup: dict[str, dict[int, int]],
1924
+ ):
1925
+ """Update breadth first with lookup."""
1926
+ # Update the captions, references and footnote references
1927
+ if isinstance(node, FloatingItem):
1928
+ node.captions = self._update_refitems_with_lookup(
1929
+ ref_items=node.captions,
1930
+ refs_to_be_deleted=refs_to_be_deleted,
1931
+ lookup=lookup,
1932
+ )
1933
+ node.references = self._update_refitems_with_lookup(
1934
+ ref_items=node.references,
1935
+ refs_to_be_deleted=refs_to_be_deleted,
1936
+ lookup=lookup,
1937
+ )
1938
+ node.footnotes = self._update_refitems_with_lookup(
1939
+ ref_items=node.footnotes,
1940
+ refs_to_be_deleted=refs_to_be_deleted,
1941
+ lookup=lookup,
1942
+ )
1943
+
1944
+ # Update the self_ref reference
1945
+ if node.parent is not None:
1946
+ path = node.parent._split_ref_to_path()
1947
+ if len(path) == 3:
1948
+ node.parent = self._update_ref_with_lookup(
1949
+ item_label=path[1], item_index=int(path[2]), lookup=lookup
1950
+ )
1951
+
1952
+ # Update the parent reference
1953
+ if node.self_ref is not None:
1954
+ path = node.self_ref.split("/")
1955
+ if len(path) == 3:
1956
+ _ref = self._update_ref_with_lookup(
1957
+ item_label=path[1], item_index=int(path[2]), lookup=lookup
1958
+ )
1959
+ node.self_ref = _ref.cref
1960
+
1961
+ # Update the child references
1962
+ node.children = self._update_refitems_with_lookup(
1963
+ ref_items=node.children,
1964
+ refs_to_be_deleted=refs_to_be_deleted,
1965
+ lookup=lookup,
1966
+ )
1967
+
1968
+ for i, child_ref in enumerate(node.children):
1969
+ node = child_ref.resolve(self)
1970
+ self._update_breadth_first_with_lookup(
1971
+ node=node, refs_to_be_deleted=refs_to_be_deleted, lookup=lookup
1972
+ )
1973
+
1686
1974
  ###################################
1687
1975
  # TODO: refactor add* methods below
1688
1976
  ###################################
@@ -2321,21 +2609,33 @@ class DoclingDocument(BaseModel):
2321
2609
  included_content_layers: Optional[set[ContentLayer]] = None,
2322
2610
  _level: int = 0, # fixed parameter, carries through the node nesting level
2323
2611
  ) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
2324
- """iterate_elements.
2325
-
2326
- :param root: Optional[NodeItem]: (Default value = None)
2327
- :param with_groups: bool: (Default value = False)
2328
- :param traverse_pictures: bool: (Default value = False)
2329
- :param page_no: Optional[int]: (Default value = None)
2330
- :param _level: (Default value = 0)
2331
- :param # fixed parameter:
2332
- :param carries through the node nesting level:
2333
- """
2612
+ """Iterate elements with level."""
2613
+ for item, stack in self._iterate_items_with_stack(
2614
+ root=root,
2615
+ with_groups=with_groups,
2616
+ traverse_pictures=traverse_pictures,
2617
+ page_no=page_no,
2618
+ included_content_layers=included_content_layers,
2619
+ ):
2620
+ yield item, len(stack)
2621
+
2622
+ def _iterate_items_with_stack(
2623
+ self,
2624
+ root: Optional[NodeItem] = None,
2625
+ with_groups: bool = False,
2626
+ traverse_pictures: bool = False,
2627
+ page_no: Optional[int] = None,
2628
+ included_content_layers: Optional[set[ContentLayer]] = None,
2629
+ _stack: Optional[list[int]] = None,
2630
+ ) -> typing.Iterable[Tuple[NodeItem, list[int]]]: # tuple of node and level
2631
+ """Iterate elements with stack."""
2334
2632
  my_layers = (
2335
2633
  included_content_layers
2336
2634
  if included_content_layers is not None
2337
2635
  else DEFAULT_CONTENT_LAYERS
2338
2636
  )
2637
+ my_stack: list[int] = _stack if _stack is not None else []
2638
+
2339
2639
  if not root:
2340
2640
  root = self.body
2341
2641
 
@@ -2355,25 +2655,31 @@ class DoclingDocument(BaseModel):
2355
2655
  )
2356
2656
 
2357
2657
  if should_yield:
2358
- yield root, _level
2658
+ yield root, my_stack
2359
2659
 
2360
2660
  # Handle picture traversal - only traverse children if requested
2361
2661
  if isinstance(root, PictureItem) and not traverse_pictures:
2362
2662
  return
2363
2663
 
2664
+ my_stack.append(-1)
2665
+
2364
2666
  # Traverse children
2365
- for child_ref in root.children:
2667
+ for child_ind, child_ref in enumerate(root.children):
2668
+ my_stack[-1] = child_ind
2366
2669
  child = child_ref.resolve(self)
2670
+
2367
2671
  if isinstance(child, NodeItem):
2368
- yield from self.iterate_items(
2672
+ yield from self._iterate_items_with_stack(
2369
2673
  child,
2370
2674
  with_groups=with_groups,
2371
2675
  traverse_pictures=traverse_pictures,
2372
2676
  page_no=page_no,
2373
- _level=_level + 1,
2677
+ _stack=my_stack,
2374
2678
  included_content_layers=my_layers,
2375
2679
  )
2376
2680
 
2681
+ my_stack.pop()
2682
+
2377
2683
  def _clear_picture_pil_cache(self):
2378
2684
  """Clear cache storage of all images."""
2379
2685
  for item, level in self.iterate_items(with_groups=False):
@@ -2646,6 +2952,7 @@ class DoclingDocument(BaseModel):
2646
2952
  strict_text: bool = False,
2647
2953
  escape_underscores: bool = True,
2648
2954
  image_placeholder: str = "<!-- image -->",
2955
+ enable_chart_tables: bool = True,
2649
2956
  image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
2650
2957
  indent: int = 4,
2651
2958
  text_width: int = -1,
@@ -2713,6 +3020,7 @@ class DoclingDocument(BaseModel):
2713
3020
  stop_idx=to_element,
2714
3021
  escape_underscores=escape_underscores,
2715
3022
  image_placeholder=image_placeholder,
3023
+ enable_chart_tables=enable_chart_tables,
2716
3024
  image_mode=image_mode,
2717
3025
  indent=indent,
2718
3026
  wrap_width=text_width if text_width > 0 else None,
@@ -2763,12 +3071,14 @@ class DoclingDocument(BaseModel):
2763
3071
  formula_to_mathml: bool = True,
2764
3072
  page_no: Optional[int] = None,
2765
3073
  html_lang: str = "en",
2766
- html_head: str = _HTML_DEFAULT_HEAD,
3074
+ html_head: str = "null", # should be deprecated
2767
3075
  included_content_layers: Optional[set[ContentLayer]] = None,
3076
+ split_page_view: bool = False,
2768
3077
  ):
2769
3078
  """Save to HTML."""
2770
3079
  if isinstance(filename, str):
2771
3080
  filename = Path(filename)
3081
+
2772
3082
  artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
2773
3083
 
2774
3084
  if image_mode == ImageRefMode.REFERENCED:
@@ -2788,6 +3098,7 @@ class DoclingDocument(BaseModel):
2788
3098
  html_lang=html_lang,
2789
3099
  html_head=html_head,
2790
3100
  included_content_layers=included_content_layers,
3101
+ split_page_view=split_page_view,
2791
3102
  )
2792
3103
 
2793
3104
  with open(filename, "w", encoding="utf-8") as fw:
@@ -2836,245 +3147,51 @@ class DoclingDocument(BaseModel):
2836
3147
  formula_to_mathml: bool = True,
2837
3148
  page_no: Optional[int] = None,
2838
3149
  html_lang: str = "en",
2839
- html_head: str = _HTML_DEFAULT_HEAD,
3150
+ html_head: str = "null", # should be deprecated ...
2840
3151
  included_content_layers: Optional[set[ContentLayer]] = None,
3152
+ split_page_view: bool = False,
2841
3153
  ) -> str:
2842
3154
  r"""Serialize to HTML."""
2843
- my_labels = labels if labels is not None else DEFAULT_EXPORT_LABELS
3155
+ from docling_core.experimental.serializer.html import (
3156
+ HTMLDocSerializer,
3157
+ HTMLOutputStyle,
3158
+ HTMLParams,
3159
+ )
3160
+
3161
+ my_labels = labels if labels is not None else DOCUMENT_TOKENS_EXPORT_LABELS
2844
3162
  my_layers = (
2845
3163
  included_content_layers
2846
3164
  if included_content_layers is not None
2847
3165
  else DEFAULT_CONTENT_LAYERS
2848
3166
  )
2849
3167
 
2850
- def close_lists(
2851
- curr_level: int,
2852
- prev_level: int,
2853
- in_ordered_list: List[bool],
2854
- html_texts: list[str],
2855
- ):
2856
-
2857
- if len(in_ordered_list) == 0:
2858
- return (in_ordered_list, html_texts)
2859
-
2860
- while curr_level < prev_level and len(in_ordered_list) > 0:
2861
- if in_ordered_list[-1]:
2862
- html_texts.append("</ol>")
2863
- else:
2864
- html_texts.append("</ul>")
2865
-
2866
- prev_level -= 1
2867
- in_ordered_list.pop() # = in_ordered_list[:-1]
2868
-
2869
- return (in_ordered_list, html_texts)
2870
-
2871
- head_lines = [
2872
- "<!DOCTYPE html>",
2873
- f'<html lang="{html_lang}">',
2874
- html_head,
2875
- ]
2876
- html_texts: list[str] = []
2877
-
2878
- prev_level = 0 # Track the previous item's level
2879
-
2880
- in_ordered_list: List[bool] = [] # False
2881
-
2882
- def _prepare_tag_content(
2883
- text: str, do_escape_html=True, do_replace_newline=True
2884
- ) -> str:
2885
- if do_escape_html:
2886
- text = html.escape(text, quote=False)
2887
- if do_replace_newline:
2888
- text = text.replace("\n", "<br>")
2889
- return text
2890
-
2891
- for ix, (item, curr_level) in enumerate(
2892
- self.iterate_items(
2893
- self.body,
2894
- with_groups=True,
2895
- page_no=page_no,
2896
- included_content_layers=my_layers,
2897
- )
2898
- ):
2899
- # If we've moved to a lower level, we're exiting one or more groups
2900
- if curr_level < prev_level and len(in_ordered_list) > 0:
2901
- # Calculate how many levels we've exited
2902
- # level_difference = previous_level - level
2903
- # Decrement list_nesting_level for each list group we've exited
2904
- # list_nesting_level = max(0, list_nesting_level - level_difference)
2905
-
2906
- in_ordered_list, html_texts = close_lists(
2907
- curr_level=curr_level,
2908
- prev_level=prev_level,
2909
- in_ordered_list=in_ordered_list,
2910
- html_texts=html_texts,
2911
- )
2912
-
2913
- prev_level = curr_level # Update previous_level for next iteration
2914
-
2915
- if ix < from_element or to_element <= ix:
2916
- continue # skip as many items as you want
2917
-
2918
- if (isinstance(item, DocItem)) and (item.label not in my_labels):
2919
- continue # skip any label that is not whitelisted
3168
+ output_style = HTMLOutputStyle.SINGLE_COLUMN
3169
+ if split_page_view:
3170
+ output_style = HTMLOutputStyle.SPLIT_PAGE
2920
3171
 
2921
- if isinstance(item, GroupItem) and item.label in [
2922
- GroupLabel.ORDERED_LIST,
2923
- ]:
2924
-
2925
- text = "<ol>"
2926
- html_texts.append(text)
2927
-
2928
- # Increment list nesting level when entering a new list
2929
- in_ordered_list.append(True)
2930
-
2931
- elif isinstance(item, GroupItem) and item.label in [
2932
- GroupLabel.LIST,
2933
- ]:
2934
-
2935
- text = "<ul>"
2936
- html_texts.append(text)
2937
-
2938
- # Increment list nesting level when entering a new list
2939
- in_ordered_list.append(False)
2940
-
2941
- elif isinstance(item, GroupItem):
2942
- continue
2943
-
2944
- elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
2945
- text_inner = _prepare_tag_content(item.text)
2946
- text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)
2947
-
2948
- html_texts.append(text)
2949
-
2950
- elif isinstance(item, SectionHeaderItem):
2951
-
2952
- section_level: int = min(item.level + 1, 6)
2953
-
2954
- text = get_html_tag_with_text_direction(
2955
- html_tag=f"h{section_level}",
2956
- text=_prepare_tag_content(item.text),
2957
- )
2958
- html_texts.append(text)
2959
-
2960
- elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
2961
-
2962
- math_formula = _prepare_tag_content(
2963
- item.text, do_escape_html=False, do_replace_newline=False
2964
- )
2965
- text = ""
2966
-
2967
- def _image_fallback(item: TextItem):
2968
- item_image = item.get_image(doc=self)
2969
- if item_image is not None:
2970
- img_ref = ImageRef.from_pil(item_image, dpi=72)
2971
- return (
2972
- "<figure>"
2973
- f'<img src="{img_ref.uri}" alt="{item.orig}" />'
2974
- "</figure>"
2975
- )
2976
-
2977
- img_fallback = _image_fallback(item)
2978
-
2979
- # If the formula is not processed correcty, use its image
2980
- if (
2981
- item.text == ""
2982
- and item.orig != ""
2983
- and image_mode == ImageRefMode.EMBEDDED
2984
- and len(item.prov) > 0
2985
- and img_fallback is not None
2986
- ):
2987
- text = img_fallback
2988
-
2989
- # Building a math equation in MathML format
2990
- # ref https://www.w3.org/TR/wai-aria-1.1/#math
2991
- elif formula_to_mathml and len(math_formula) > 0:
2992
- try:
2993
- mathml_element = latex2mathml.converter.convert_to_element(
2994
- math_formula, display="block"
2995
- )
2996
- annotation = SubElement(
2997
- mathml_element, "annotation", dict(encoding="TeX")
2998
- )
2999
- annotation.text = math_formula
3000
- mathml = unescape(tostring(mathml_element, encoding="unicode"))
3001
- text = f"<div>{mathml}</div>"
3002
- except Exception as err:
3003
- _logger.warning(
3004
- "Malformed formula cannot be rendered. "
3005
- f"Error {err.__class__.__name__}, formula={math_formula}"
3006
- )
3007
- if (
3008
- image_mode == ImageRefMode.EMBEDDED
3009
- and len(item.prov) > 0
3010
- and img_fallback is not None
3011
- ):
3012
- text = img_fallback
3013
- else:
3014
- text = f"<pre>{math_formula}</pre>"
3015
-
3016
- elif math_formula != "":
3017
- text = f"<pre>{math_formula}</pre>"
3018
-
3019
- if text != "":
3020
- html_texts.append(text)
3021
- else:
3022
- html_texts.append(
3023
- '<div class="formula-not-decoded">Formula not decoded</div>'
3024
- )
3025
-
3026
- elif isinstance(item, ListItem):
3027
- text = get_html_tag_with_text_direction(
3028
- html_tag="li", text=_prepare_tag_content(item.text)
3029
- )
3030
- html_texts.append(text)
3031
-
3032
- elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
3033
- text = get_html_tag_with_text_direction(
3034
- html_tag="li", text=_prepare_tag_content(item.text)
3035
- )
3036
- html_texts.append(text)
3037
-
3038
- elif isinstance(item, CodeItem):
3039
- code_text = _prepare_tag_content(
3040
- item.text, do_escape_html=False, do_replace_newline=False
3041
- )
3042
- text = f"<pre><code>{code_text}</code></pre>"
3043
- html_texts.append(text)
3044
-
3045
- elif isinstance(item, TextItem):
3046
-
3047
- text = get_html_tag_with_text_direction(
3048
- html_tag="p", text=_prepare_tag_content(item.text)
3049
- )
3050
- html_texts.append(text)
3051
-
3052
- elif isinstance(item, TableItem):
3053
-
3054
- text = item.export_to_html(doc=self, add_caption=True)
3055
- html_texts.append(text)
3056
-
3057
- elif isinstance(item, PictureItem):
3058
-
3059
- html_texts.append(
3060
- item.export_to_html(
3061
- doc=self, add_caption=True, image_mode=image_mode
3062
- )
3063
- )
3064
-
3065
- elif isinstance(item, DocItem) and item.label in my_labels:
3066
- continue
3067
-
3068
- html_texts.append("</html>")
3172
+ params = HTMLParams(
3173
+ labels=my_labels,
3174
+ layers=my_layers,
3175
+ pages={page_no} if page_no is not None else None,
3176
+ start_idx=from_element,
3177
+ stop_idx=to_element,
3178
+ image_mode=image_mode,
3179
+ formula_to_mathml=formula_to_mathml,
3180
+ html_head=html_head,
3181
+ html_lang=html_lang,
3182
+ output_style=output_style,
3183
+ )
3069
3184
 
3070
- lines = []
3071
- lines.extend(head_lines)
3072
- lines.extend(html_texts)
3185
+ if html_head == "null":
3186
+ params.html_head = None
3073
3187
 
3074
- delim = "\n"
3075
- html_text = (delim.join(lines)).strip()
3188
+ serializer = HTMLDocSerializer(
3189
+ doc=self,
3190
+ params=params,
3191
+ )
3192
+ ser_res = serializer.serialize()
3076
3193
 
3077
- return html_text
3194
+ return ser_res.text
3078
3195
 
3079
3196
  def load_from_doctags( # noqa: C901
3080
3197
  self,
@@ -3105,6 +3222,8 @@ class DoclingDocument(BaseModel):
3105
3222
  def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
3106
3223
  """Extract <loc_...> coords from the chunk, normalized by / 500."""
3107
3224
  coords = re.findall(r"<loc_(\d+)>", text_chunk)
3225
+ if len(coords) > 4:
3226
+ coords = coords[:4]
3108
3227
  if len(coords) == 4:
3109
3228
  l, t, r, b = map(float, coords)
3110
3229
  return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
@@ -3135,11 +3254,28 @@ class DoclingDocument(BaseModel):
3135
3254
 
3136
3255
  def otsl_parse_texts(texts, tokens):
3137
3256
  split_word = TableToken.OTSL_NL.value
3257
+ # CLEAN tokens from extra tags, only structural OTSL allowed
3258
+ clean_tokens = []
3259
+ for t in tokens:
3260
+ if t in [
3261
+ TableToken.OTSL_ECEL.value,
3262
+ TableToken.OTSL_FCEL.value,
3263
+ TableToken.OTSL_LCEL.value,
3264
+ TableToken.OTSL_UCEL.value,
3265
+ TableToken.OTSL_XCEL.value,
3266
+ TableToken.OTSL_NL.value,
3267
+ TableToken.OTSL_CHED.value,
3268
+ TableToken.OTSL_RHED.value,
3269
+ TableToken.OTSL_SROW.value,
3270
+ ]:
3271
+ clean_tokens.append(t)
3272
+ tokens = clean_tokens
3138
3273
  split_row_tokens = [
3139
3274
  list(y)
3140
3275
  for x, y in itertools.groupby(tokens, lambda z: z == split_word)
3141
3276
  if not x
3142
3277
  ]
3278
+
3143
3279
  table_cells = []
3144
3280
  r_idx = 0
3145
3281
  c_idx = 0
@@ -3291,6 +3427,40 @@ class DoclingDocument(BaseModel):
3291
3427
  table_cells=table_cells,
3292
3428
  )
3293
3429
 
3430
def extract_chart_type(text_chunk: str):
    """Return the chart label for the first known chart tag found in the chunk.

    Candidate names are tried in a fixed order: the classification labels
    first, then the alias names. The first candidate whose ``<name>`` tag
    occurs in ``text_chunk`` decides the result. Returns ``None`` when no
    candidate tag is present.
    """
    # Current SmolDocling can predict different labels; map those names
    # onto the corresponding classification labels.
    alias_to_label = {
        "line": PictureClassificationLabel.LINE_CHART,
        "dot_line": PictureClassificationLabel.LINE_CHART,
        "vbar_categorical": PictureClassificationLabel.BAR_CHART,
        "hbar_categorical": PictureClassificationLabel.BAR_CHART,
    }
    # Order matters: the first matching candidate wins.
    candidates = (
        PictureClassificationLabel.PIE_CHART,
        PictureClassificationLabel.BAR_CHART,
        PictureClassificationLabel.STACKED_BAR_CHART,
        PictureClassificationLabel.LINE_CHART,
        PictureClassificationLabel.FLOW_CHART,
        PictureClassificationLabel.SCATTER_CHART,
        PictureClassificationLabel.HEATMAP,
        *alias_to_label,
    )
    for candidate in candidates:
        if f"<{candidate}>" not in text_chunk:
            continue
        resolved = alias_to_label.get(candidate, candidate)
        return PictureClassificationLabel(resolved)
    return None
+
3294
3464
  def parse_key_value_item(
3295
3465
  tokens: str, image: Optional[PILImage.Image] = None
3296
3466
  ) -> Tuple[GraphData, Optional[ProvenanceItem]]:
@@ -3422,10 +3592,9 @@ class DoclingDocument(BaseModel):
3422
3592
  rf"{DocumentToken.ORDERED_LIST.value}|"
3423
3593
  rf"{DocumentToken.UNORDERED_LIST.value}|"
3424
3594
  rf"{DocItemLabel.KEY_VALUE_REGION}|"
3595
+ rf"{DocumentToken.CHART.value}|"
3425
3596
  rf"{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
3426
3597
  )
3427
-
3428
- # DocumentToken.OTSL
3429
3598
  pattern = re.compile(tag_pattern, re.DOTALL)
3430
3599
 
3431
3600
  # Go through each match in order
@@ -3433,18 +3602,17 @@ class DoclingDocument(BaseModel):
3433
3602
  full_chunk = match.group(0)
3434
3603
  tag_name = match.group("tag")
3435
3604
 
3436
- bbox = extract_bounding_box(full_chunk) if image else None
3605
+ bbox = extract_bounding_box(full_chunk) # Extracts first bbox
3437
3606
  doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
3438
3607
 
3439
3608
  if tag_name == DocumentToken.OTSL.value:
3440
3609
  table_data = parse_table_content(full_chunk)
3441
- bbox = extract_bounding_box(full_chunk) if image else None
3442
3610
  caption, caption_bbox = extract_caption(full_chunk)
3443
3611
  if caption is not None and caption_bbox is not None:
3444
3612
  caption.prov.append(
3445
3613
  ProvenanceItem(
3446
3614
  bbox=caption_bbox.resize_by_scale(pg_width, pg_height),
3447
- charspan=(0, 0),
3615
+ charspan=(0, len(caption.text)),
3448
3616
  page_no=page_no,
3449
3617
  )
3450
3618
  )
@@ -3458,8 +3626,13 @@ class DoclingDocument(BaseModel):
3458
3626
  else:
3459
3627
  self.add_table(data=table_data, caption=caption)
3460
3628
 
3461
- elif tag_name == DocItemLabel.PICTURE:
3462
- text_caption_content = extract_inner_text(full_chunk)
3629
+ elif tag_name in [DocItemLabel.PICTURE, DocItemLabel.CHART]:
3630
+ caption, caption_bbox = extract_caption(full_chunk)
3631
+ table_data = None
3632
+ chart_type = None
3633
+ if tag_name == DocumentToken.CHART.value:
3634
+ table_data = parse_table_content(full_chunk)
3635
+ chart_type = extract_chart_type(full_chunk)
3463
3636
  if image:
3464
3637
  if bbox:
3465
3638
  im_width, im_height = image.size
@@ -3483,30 +3656,77 @@ class DoclingDocument(BaseModel):
3483
3656
  ),
3484
3657
  )
3485
3658
  # If there is a caption to an image, add it as well
3486
- if len(text_caption_content) > 0:
3487
- caption_item = self.add_text(
3488
- label=DocItemLabel.CAPTION,
3489
- text=text_caption_content,
3490
- parent=None,
3659
+ if caption is not None and caption_bbox is not None:
3660
+ caption.prov.append(
3661
+ ProvenanceItem(
3662
+ bbox=caption_bbox.resize_by_scale(
3663
+ pg_width, pg_height
3664
+ ),
3665
+ charspan=(0, len(caption.text)),
3666
+ page_no=page_no,
3667
+ )
3491
3668
  )
3492
- pic.captions.append(caption_item.get_ref())
3669
+ pic.captions.append(caption.get_ref())
3670
+ pic_title = "picture"
3671
+ if chart_type is not None:
3672
+ pic.annotations.append(
3673
+ PictureClassificationData(
3674
+ provenance="load_from_doctags",
3675
+ predicted_classes=[
3676
+ # chart_type
3677
+ PictureClassificationClass(
3678
+ class_name=chart_type, confidence=1.0
3679
+ )
3680
+ ],
3681
+ )
3682
+ )
3683
+ pic_title = chart_type
3684
if table_data is not None:
    # Add chart data as PictureTabularChartData.
    # A descriptive local name is used instead of `pd`: assigning to `pd`
    # would make it a local of the enclosing function and hide the
    # module-level `import pandas as pd` for the rest of that function.
    tabular_chart_data = PictureTabularChartData(
        chart_data=table_data, title=pic_title
    )
    pic.annotations.append(tabular_chart_data)
3493
3690
  else:
3494
3691
  if bbox:
3495
3692
  # In case we don't have access to an binary of an image
3496
- self.add_picture(
3693
+ pic = self.add_picture(
3497
3694
  parent=None,
3498
3695
  prov=ProvenanceItem(
3499
3696
  bbox=bbox, charspan=(0, 0), page_no=page_no
3500
3697
  ),
3501
3698
  )
3502
3699
  # If there is a caption to an image, add it as well
3503
- if len(text_caption_content) > 0:
3504
- caption_item = self.add_text(
3505
- label=DocItemLabel.CAPTION,
3506
- text=text_caption_content,
3507
- parent=None,
3700
+ if caption is not None and caption_bbox is not None:
3701
+ caption.prov.append(
3702
+ ProvenanceItem(
3703
+ bbox=caption_bbox.resize_by_scale(
3704
+ pg_width, pg_height
3705
+ ),
3706
+ charspan=(0, len(caption.text)),
3707
+ page_no=page_no,
3708
+ )
3709
+ )
3710
+ pic.captions.append(caption.get_ref())
3711
+ if chart_type is not None:
3712
+ pic.annotations.append(
3713
+ PictureClassificationData(
3714
+ provenance="load_from_doctags",
3715
+ predicted_classes=[
3716
+ # chart_type
3717
+ PictureClassificationClass(
3718
+ class_name=chart_type, confidence=1.0
3719
+ )
3720
+ ],
3721
+ )
3508
3722
  )
3509
- pic.captions.append(caption_item.get_ref())
3723
if table_data is not None:
    # Add chart data as PictureTabularChartData.
    # In the visible code `pic_title` is assigned only in the branch that
    # has a page image, so reading it here could hit an unbound (or stale)
    # name. Derive the title locally with the same rule as that branch:
    # the chart type when known, otherwise "picture".
    chart_title = chart_type if chart_type is not None else "picture"
    # Avoid naming the local `pd`, which would shadow the module-level
    # `import pandas as pd` inside the enclosing function.
    tabular_chart_data = PictureTabularChartData(
        chart_data=table_data, title=chart_title
    )
    pic.annotations.append(tabular_chart_data)
3729
+
3510
3730
  elif tag_name == DocItemLabel.KEY_VALUE_REGION:
3511
3731
  key_value_data, kv_item_prov = parse_key_value_item(
3512
3732
  full_chunk, image