docling-core 2.4.0__py3-none-any.whl → 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of docling-core might be problematic.
(The source page linked to more details here; the link target is not preserved in this text.)

@@ -10,8 +10,9 @@ from pydantic import BaseModel
10
10
  class ImageRefMode(str, Enum):
11
11
  """ImageRefMode."""
12
12
 
13
- PLACEHOLDER = "placeholder"
14
- EMBEDDED = "embedded"
13
+ PLACEHOLDER = "placeholder" # just a place-holder
14
+ EMBEDDED = "embedded" # embed the image as a base64
15
+ REFERENCED = "referenced" # reference the image via uri
15
16
 
16
17
 
17
18
  class CoordOrigin(str, Enum):
@@ -1,15 +1,22 @@
1
1
  """Models for the Docling Document data type."""
2
2
 
3
3
  import base64
4
+ import copy
5
+ import hashlib
6
+ import json
4
7
  import mimetypes
8
+ import os
5
9
  import re
6
10
  import sys
7
11
  import textwrap
8
12
  import typing
9
13
  from io import BytesIO
14
+ from pathlib import Path
10
15
  from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
16
+ from urllib.parse import unquote
11
17
 
12
18
  import pandas as pd
19
+ import yaml
13
20
  from PIL import Image as PILImage
14
21
  from pydantic import (
15
22
  AnyUrl,
@@ -30,6 +37,7 @@ from docling_core.types.doc import BoundingBox, Size
30
37
  from docling_core.types.doc.base import ImageRefMode
31
38
  from docling_core.types.doc.labels import DocItemLabel, GroupLabel
32
39
  from docling_core.types.legacy_doc.tokens import DocumentToken
40
+ from docling_core.utils.file import relative_path
33
41
 
34
42
  Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
35
43
  LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
@@ -436,21 +444,25 @@ class ImageRef(BaseModel):
436
444
  mimetype: str
437
445
  dpi: int
438
446
  size: Size
439
- uri: AnyUrl
447
+ uri: Union[AnyUrl, Path]
440
448
  _pil: Optional[PILImage.Image] = None
441
449
 
442
450
  @property
443
- def pil_image(self) -> PILImage.Image:
451
+ def pil_image(self) -> Optional[PILImage.Image]:
444
452
  """Return the PIL Image."""
445
453
  if self._pil is not None:
446
454
  return self._pil
447
455
 
448
- if str(self.uri).startswith("data:"):
449
- encoded_img = str(self.uri).split(",")[1]
450
- decoded_img = base64.b64decode(encoded_img)
451
- self._pil = PILImage.open(BytesIO(decoded_img))
452
- else:
453
- self._pil = PILImage.open(str(self.uri))
456
+ if isinstance(self.uri, AnyUrl):
457
+ if self.uri.scheme == "data":
458
+ encoded_img = str(self.uri).split(",")[1]
459
+ decoded_img = base64.b64decode(encoded_img)
460
+ self._pil = PILImage.open(BytesIO(decoded_img))
461
+ elif self.uri.scheme == "file":
462
+ self._pil = PILImage.open(unquote(str(self.uri.path)))
463
+ # else: Handle http request or other protocols...
464
+ elif isinstance(self.uri, Path):
465
+ self._pil = PILImage.open(self.uri)
454
466
 
455
467
  return self._pil
456
468
 
@@ -566,6 +578,8 @@ class DocItem(
566
578
  return None
567
579
 
568
580
  page_image = page.image.pil_image
581
+ if not page_image:
582
+ return None
569
583
  crop_bbox = (
570
584
  self.prov[0]
571
585
  .bbox.to_top_left_origin(page_height=page.size.height)
@@ -631,6 +645,50 @@ class SectionHeaderItem(TextItem):
631
645
  label: typing.Literal[DocItemLabel.SECTION_HEADER] = DocItemLabel.SECTION_HEADER
632
646
  level: LevelNumber
633
647
 
648
+ def export_to_document_tokens(
649
+ self,
650
+ doc: "DoclingDocument",
651
+ new_line: str = "\n",
652
+ xsize: int = 100,
653
+ ysize: int = 100,
654
+ add_location: bool = True,
655
+ add_content: bool = True,
656
+ add_page_index: bool = True,
657
+ ):
658
+ r"""Export text element to document tokens format.
659
+
660
+ :param doc: "DoclingDocument":
661
+ :param new_line: str: (Default value = "\n")
662
+ :param xsize: int: (Default value = 100)
663
+ :param ysize: int: (Default value = 100)
664
+ :param add_location: bool: (Default value = True)
665
+ :param add_content: bool: (Default value = True)
666
+ :param add_page_index: bool: (Default value = True)
667
+
668
+ """
669
+ body = f"<{self.label.value}_level_{self.level}>"
670
+
671
+ # TODO: This must be done through an explicit mapping.
672
+ # assert DocumentToken.is_known_token(
673
+ # body
674
+ # ), f"failed DocumentToken.is_known_token({body})"
675
+
676
+ if add_location:
677
+ body += self.get_location_tokens(
678
+ doc=doc,
679
+ new_line="",
680
+ xsize=xsize,
681
+ ysize=ysize,
682
+ add_page_index=add_page_index,
683
+ )
684
+
685
+ if add_content and self.text is not None:
686
+ body += self.text.strip()
687
+
688
+ body += f"</{self.label.value}_level_{self.level}>{new_line}"
689
+
690
+ return body
691
+
634
692
 
635
693
  class ListItem(TextItem):
636
694
  """SectionItem."""
@@ -677,6 +735,152 @@ class PictureItem(FloatingItem):
677
735
 
678
736
  annotations: List[PictureDataType] = []
679
737
 
738
+ # Convert the image to Base64
739
+ def _image_to_base64(self, pil_image, format="PNG"):
740
+ """Base64 representation of the image."""
741
+ buffered = BytesIO()
742
+ pil_image.save(buffered, format=format) # Save the image to the byte stream
743
+ img_bytes = buffered.getvalue() # Get the byte data
744
+ img_base64 = base64.b64encode(img_bytes).decode(
745
+ "utf-8"
746
+ ) # Encode to Base64 and decode to string
747
+ return img_base64
748
+
749
+ def _image_to_hexhash(self) -> Optional[str]:
750
+ """Hexash from the image."""
751
+ if self.image is not None and self.image._pil is not None:
752
+ # Convert the image to raw bytes
753
+ image_bytes = self.image._pil.tobytes()
754
+
755
+ # Create a hash object (e.g., SHA-256)
756
+ hasher = hashlib.sha256()
757
+
758
+ # Feed the image bytes into the hash object
759
+ hasher.update(image_bytes)
760
+
761
+ # Get the hexadecimal representation of the hash
762
+ return hasher.hexdigest()
763
+
764
+ return None
765
+
766
+ def export_to_markdown(
767
+ self,
768
+ doc: "DoclingDocument",
769
+ add_caption: bool = True,
770
+ image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
771
+ image_placeholder: str = "<!-- image -->",
772
+ ) -> str:
773
+ """Export picture to Markdown format."""
774
+ default_response = "\n" + image_placeholder + "\n"
775
+ error_response = (
776
+ "\n<!-- 🖼️❌ Image not available. "
777
+ "Please use `PdfPipelineOptions(generate_picture_images=True)`"
778
+ " --> \n"
779
+ )
780
+
781
+ if image_mode == ImageRefMode.PLACEHOLDER:
782
+ return default_response
783
+
784
+ elif image_mode == ImageRefMode.EMBEDDED:
785
+
786
+ # short-cut: we already have the image in base64
787
+ if (
788
+ isinstance(self.image, ImageRef)
789
+ and isinstance(self.image.uri, AnyUrl)
790
+ and self.image.uri.scheme == "data"
791
+ ):
792
+ text = f"\n![Image]({self.image.uri})\n"
793
+ return text
794
+
795
+ # get the self.image._pil or crop it out of the page-image
796
+ img = self.get_image(doc)
797
+
798
+ if img is not None:
799
+ imgb64 = self._image_to_base64(img)
800
+ text = f"\n![Image]({imgb64})\n"
801
+
802
+ return text
803
+ else:
804
+ return error_response
805
+
806
+ elif image_mode == ImageRefMode.REFERENCED:
807
+ if not isinstance(self.image, ImageRef) or (
808
+ isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "data"
809
+ ):
810
+ return default_response
811
+
812
+ if (
813
+ isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "file"
814
+ ) or isinstance(self.image.uri, Path):
815
+ text = f"\n![Image]({str(self.image.uri)})\n"
816
+ return text
817
+
818
+ else:
819
+ return default_response
820
+
821
+ else:
822
+ return default_response
823
+
824
+ def export_to_html(
825
+ self,
826
+ doc: "DoclingDocument",
827
+ add_caption: bool = True,
828
+ image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
829
+ ) -> str:
830
+ """Export picture to HTML format."""
831
+ text = ""
832
+ if add_caption and len(self.captions):
833
+ text = self.caption_text(doc)
834
+
835
+ caption_text = ""
836
+ if len(text) > 0:
837
+ caption_text = f"<figcaption>{text}</figcaption>"
838
+
839
+ default_response = f"<figure>{caption_text}</figure>"
840
+
841
+ if image_mode == ImageRefMode.PLACEHOLDER:
842
+ return default_response
843
+
844
+ elif image_mode == ImageRefMode.EMBEDDED:
845
+ # short-cut: we already have the image in base64
846
+ if (
847
+ isinstance(self.image, ImageRef)
848
+ and isinstance(self.image.uri, AnyUrl)
849
+ and self.image.uri.scheme == "data"
850
+ ):
851
+ img_text = f'<img src="{self.image.uri}">'
852
+ return f"<figure>{caption_text}{img_text}</figure>"
853
+
854
+ # get the self.image._pil or crop it out of the page-image
855
+ img = self.get_image(doc)
856
+
857
+ if img is not None:
858
+ imgb64 = self._image_to_base64(img)
859
+ img_text = f'<img src="data:image/png;base64,{imgb64}">'
860
+
861
+ return f"<figure>{caption_text}{img_text}</figure>"
862
+ else:
863
+ return default_response
864
+
865
+ elif image_mode == ImageRefMode.REFERENCED:
866
+
867
+ if not isinstance(self.image, ImageRef) or (
868
+ isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "data"
869
+ ):
870
+ return default_response
871
+
872
+ if (
873
+ isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "file"
874
+ ) or isinstance(self.image.uri, Path):
875
+ img_text = f'<img src="{str(self.image.uri)}">'
876
+ return f"<figure>{caption_text}{img_text}</figure>"
877
+
878
+ else:
879
+ return default_response
880
+
881
+ else:
882
+ return default_response
883
+
680
884
  def export_to_document_tokens(
681
885
  self,
682
886
  doc: "DoclingDocument",
@@ -804,14 +1008,21 @@ class TableItem(FloatingItem):
804
1008
  )
805
1009
  return md_table
806
1010
 
807
- def export_to_html(self) -> str:
1011
+ def export_to_html(self, doc: "DoclingDocument", add_caption: bool = True) -> str:
808
1012
  """Export the table as html."""
809
1013
  body = ""
810
1014
  nrows = self.data.num_rows
811
1015
  ncols = self.data.num_cols
812
1016
 
813
- if not len(self.data.table_cells):
1017
+ text = ""
1018
+ if add_caption and len(self.captions):
1019
+ text = self.caption_text(doc)
1020
+
1021
+ if len(self.data.table_cells) == 0:
814
1022
  return ""
1023
+
1024
+ body = ""
1025
+
815
1026
  for i in range(nrows):
816
1027
  body += "<tr>"
817
1028
  for j in range(ncols):
@@ -844,7 +1055,15 @@ class TableItem(FloatingItem):
844
1055
 
845
1056
  body += f"<{opening_tag}>{content}</{celltag}>"
846
1057
  body += "</tr>"
847
- body = f"<table>{body}</table>"
1058
+
1059
+ if len(text) > 0 and len(body) > 0:
1060
+ body = f"<table><caption>{text}</caption><tbody>{body}</tbody></table>"
1061
+ elif len(text) == 0 and len(body) > 0:
1062
+ body = f"<table><tbody>{body}</tbody></table>"
1063
+ elif len(text) > 0 and len(body) == 0:
1064
+ body = f"<table><caption>{text}</caption></table>"
1065
+ else:
1066
+ body = "<table></table>"
848
1067
 
849
1068
  return body
850
1069
 
@@ -981,6 +1200,23 @@ class PageItem(BaseModel):
981
1200
  class DoclingDocument(BaseModel):
982
1201
  """DoclingDocument."""
983
1202
 
1203
+ _HTML_DEFAULT_HEAD: str = r"""<head>
1204
+ <meta charset="UTF-8">
1205
+ <style>
1206
+ table {
1207
+ border-collapse: separate;
1208
+ /* Maintain separate borders */
1209
+ border-spacing: 5px; /*
1210
+ Space between cells */
1211
+ width: 50%;
1212
+ }
1213
+ th, td {
1214
+ border: 1px solid black;
1215
+ /* Add lines etween cells */
1216
+ padding: 8px; }
1217
+ </style>
1218
+ </head>"""
1219
+
984
1220
  schema_name: typing.Literal["DoclingDocument"] = "DoclingDocument"
985
1221
  version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
986
1222
  CURRENT_VERSION
@@ -1045,7 +1281,7 @@ class DoclingDocument(BaseModel):
1045
1281
  prov: Optional[ProvenanceItem] = None,
1046
1282
  parent: Optional[GroupItem] = None,
1047
1283
  ):
1048
- """add_paragraph.
1284
+ """add_list_item.
1049
1285
 
1050
1286
  :param label: str:
1051
1287
  :param text: str:
@@ -1088,7 +1324,7 @@ class DoclingDocument(BaseModel):
1088
1324
  prov: Optional[ProvenanceItem] = None,
1089
1325
  parent: Optional[GroupItem] = None,
1090
1326
  ):
1091
- """add_paragraph.
1327
+ """add_text.
1092
1328
 
1093
1329
  :param label: str:
1094
1330
  :param text: str:
@@ -1097,28 +1333,41 @@ class DoclingDocument(BaseModel):
1097
1333
  :param parent: Optional[GroupItem]: (Default value = None)
1098
1334
 
1099
1335
  """
1100
- if not parent:
1101
- parent = self.body
1336
+ # Catch a few cases that are in principle allowed
1337
+ # but that will create confusion down the road
1338
+ if label in [DocItemLabel.TITLE]:
1339
+ return self.add_title(text=text, orig=orig, prov=prov, parent=parent)
1102
1340
 
1103
- if not orig:
1104
- orig = text
1341
+ elif label in [DocItemLabel.LIST_ITEM]:
1342
+ return self.add_list_item(text=text, orig=orig, prov=prov, parent=parent)
1105
1343
 
1106
- text_index = len(self.texts)
1107
- cref = f"#/texts/{text_index}"
1108
- text_item = TextItem(
1109
- label=label,
1110
- text=text,
1111
- orig=orig,
1112
- self_ref=cref,
1113
- parent=parent.get_ref(),
1114
- )
1115
- if prov:
1116
- text_item.prov.append(prov)
1344
+ elif label in [DocItemLabel.SECTION_HEADER]:
1345
+ return self.add_heading(text=text, orig=orig, prov=prov, parent=parent)
1117
1346
 
1118
- self.texts.append(text_item)
1119
- parent.children.append(RefItem(cref=cref))
1347
+ else:
1120
1348
 
1121
- return text_item
1349
+ if not parent:
1350
+ parent = self.body
1351
+
1352
+ if not orig:
1353
+ orig = text
1354
+
1355
+ text_index = len(self.texts)
1356
+ cref = f"#/texts/{text_index}"
1357
+ text_item = TextItem(
1358
+ label=label,
1359
+ text=text,
1360
+ orig=orig,
1361
+ self_ref=cref,
1362
+ parent=parent.get_ref(),
1363
+ )
1364
+ if prov:
1365
+ text_item.prov.append(prov)
1366
+
1367
+ self.texts.append(text_item)
1368
+ parent.children.append(RefItem(cref=cref))
1369
+
1370
+ return text_item
1122
1371
 
1123
1372
  def add_table(
1124
1373
  self,
@@ -1170,7 +1419,6 @@ class DoclingDocument(BaseModel):
1170
1419
  :param RefItem]]: (Default value = None)
1171
1420
  :param prov: Optional[ProvenanceItem]: (Default value = None)
1172
1421
  :param parent: Optional[GroupItem]: (Default value = None)
1173
-
1174
1422
  """
1175
1423
  if not parent:
1176
1424
  parent = self.body
@@ -1195,6 +1443,43 @@ class DoclingDocument(BaseModel):
1195
1443
 
1196
1444
  return fig_item
1197
1445
 
1446
+ def add_title(
1447
+ self,
1448
+ text: str,
1449
+ orig: Optional[str] = None,
1450
+ prov: Optional[ProvenanceItem] = None,
1451
+ parent: Optional[GroupItem] = None,
1452
+ ):
1453
+ """add_title.
1454
+
1455
+ :param text: str:
1456
+ :param orig: Optional[str]: (Default value = None)
1457
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
1458
+ :param parent: Optional[GroupItem]: (Default value = None)
1459
+ """
1460
+ if not parent:
1461
+ parent = self.body
1462
+
1463
+ if not orig:
1464
+ orig = text
1465
+
1466
+ text_index = len(self.texts)
1467
+ cref = f"#/texts/{text_index}"
1468
+ text_item = TextItem(
1469
+ label=DocItemLabel.TITLE,
1470
+ text=text,
1471
+ orig=orig,
1472
+ self_ref=cref,
1473
+ parent=parent.get_ref(),
1474
+ )
1475
+ if prov:
1476
+ text_item.prov.append(prov)
1477
+
1478
+ self.texts.append(text_item)
1479
+ parent.children.append(RefItem(cref=cref))
1480
+
1481
+ return text_item
1482
+
1198
1483
  def add_heading(
1199
1484
  self,
1200
1485
  text: str,
@@ -1211,7 +1496,6 @@ class DoclingDocument(BaseModel):
1211
1496
  :param level: LevelNumber: (Default value = 1)
1212
1497
  :param prov: Optional[ProvenanceItem]: (Default value = None)
1213
1498
  :param parent: Optional[GroupItem]: (Default value = None)
1214
-
1215
1499
  """
1216
1500
  if not parent:
1217
1501
  parent = self.body
@@ -1297,17 +1581,220 @@ class DoclingDocument(BaseModel):
1297
1581
  page_no=page_no,
1298
1582
  )
1299
1583
 
1584
+ def _clear_picture_pil_cache(self):
1585
+ """Clear cache storage of all images."""
1586
+ for item, level in self.iterate_items(with_groups=False):
1587
+ if isinstance(item, PictureItem):
1588
+ if item.image is not None and item.image._pil is not None:
1589
+ item.image._pil.close()
1590
+
1591
+ def _list_images_on_disk(self) -> List[Path]:
1592
+ """List all images on disk."""
1593
+ result: List[Path] = []
1594
+
1595
+ for item, level in self.iterate_items(with_groups=False):
1596
+ if isinstance(item, PictureItem):
1597
+ if item.image is not None:
1598
+ if (
1599
+ isinstance(item.image.uri, AnyUrl)
1600
+ and item.image.uri.scheme == "file"
1601
+ and item.image.uri.path is not None
1602
+ ):
1603
+ local_path = Path(unquote(item.image.uri.path))
1604
+ result.append(local_path)
1605
+ elif isinstance(item.image.uri, Path):
1606
+ result.append(item.image.uri)
1607
+
1608
+ return result
1609
+
1610
+ def _with_embedded_pictures(self) -> "DoclingDocument":
1611
+ """Document with embedded images.
1612
+
1613
+ Creates a copy of this document where all pictures referenced
1614
+ through a file URI are turned into base64 embedded form.
1615
+ """
1616
+ result: DoclingDocument = copy.deepcopy(self)
1617
+
1618
+ for ix, (item, level) in enumerate(result.iterate_items(with_groups=True)):
1619
+ if isinstance(item, PictureItem):
1620
+
1621
+ if item.image is not None:
1622
+ if (
1623
+ isinstance(item.image.uri, AnyUrl)
1624
+ and item.image.uri.scheme == "file"
1625
+ ):
1626
+ assert isinstance(item.image.uri.path, str)
1627
+ tmp_image = PILImage.open(str(unquote(item.image.uri.path)))
1628
+ item.image = ImageRef.from_pil(tmp_image, dpi=item.image.dpi)
1629
+
1630
+ elif isinstance(item.image.uri, Path):
1631
+ tmp_image = PILImage.open(str(item.image.uri))
1632
+ item.image = ImageRef.from_pil(tmp_image, dpi=item.image.dpi)
1633
+
1634
+ return result
1635
+
1636
+ def _with_pictures_refs(
1637
+ self, image_dir: Path, reference_path: Optional[Path] = None
1638
+ ) -> "DoclingDocument":
1639
+ """Document with images as refs.
1640
+
1641
+ Creates a copy of this document where all picture data is
1642
+ saved to image_dir and referenced through file URIs.
1643
+ """
1644
+ result: DoclingDocument = copy.deepcopy(self)
1645
+
1646
+ img_count = 0
1647
+ image_dir.mkdir(parents=True, exist_ok=True)
1648
+
1649
+ if image_dir.is_dir():
1650
+ for item, level in result.iterate_items(with_groups=False):
1651
+ if isinstance(item, PictureItem):
1652
+
1653
+ if (
1654
+ item.image is not None
1655
+ and isinstance(item.image.uri, AnyUrl)
1656
+ and item.image.uri.scheme == "data"
1657
+ and item.image.pil_image is not None
1658
+ ):
1659
+ img = item.image.pil_image
1660
+
1661
+ hexhash = item._image_to_hexhash()
1662
+
1663
+ # loc_path = image_dir / f"image_{img_count:06}.png"
1664
+ if hexhash is not None:
1665
+ loc_path = image_dir / f"image_{img_count:06}_{hexhash}.png"
1666
+
1667
+ img.save(loc_path)
1668
+ if reference_path is not None:
1669
+ obj_path = relative_path(
1670
+ reference_path.resolve(), loc_path.resolve()
1671
+ )
1672
+ else:
1673
+ obj_path = loc_path
1674
+
1675
+ item.image.uri = Path(obj_path)
1676
+
1677
+ # if item.image._pil is not None:
1678
+ # item.image._pil.close()
1679
+
1680
+ img_count += 1
1681
+
1682
+ return result
1683
+
1300
1684
  def print_element_tree(self):
1301
- """print_element_tree."""
1685
+ """Print_element_tree."""
1302
1686
  for ix, (item, level) in enumerate(self.iterate_items(with_groups=True)):
1303
1687
  if isinstance(item, GroupItem):
1304
1688
  print(" " * level, f"{ix}: {item.label.value} with name={item.name}")
1305
1689
  elif isinstance(item, DocItem):
1306
1690
  print(" " * level, f"{ix}: {item.label.value}")
1307
1691
 
1308
- def export_to_dict(self) -> Dict:
1309
- """export_to_dict."""
1310
- return self.model_dump(mode="json", by_alias=True, exclude_none=True)
1692
+ def export_to_element_tree(self) -> str:
1693
+ """Export_to_element_tree."""
1694
+ texts = []
1695
+ for ix, (item, level) in enumerate(self.iterate_items(with_groups=True)):
1696
+ if isinstance(item, GroupItem):
1697
+ texts.append(
1698
+ " " * level + f"{ix}: {item.label.value} with name={item.name}"
1699
+ )
1700
+ elif isinstance(item, DocItem):
1701
+ texts.append(" " * level + f"{ix}: {item.label.value}")
1702
+
1703
+ return "\n".join(texts)
1704
+
1705
+ def save_as_json(
1706
+ self,
1707
+ filename: Path,
1708
+ artifacts_dir: Optional[Path] = None,
1709
+ image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
1710
+ indent: int = 2,
1711
+ ):
1712
+ """Save as json."""
1713
+ artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
1714
+
1715
+ if image_mode == ImageRefMode.REFERENCED:
1716
+ os.makedirs(artifacts_dir, exist_ok=True)
1717
+
1718
+ new_doc = self._make_copy_with_refmode(
1719
+ artifacts_dir, image_mode, reference_path=reference_path
1720
+ )
1721
+
1722
+ out = new_doc.export_to_dict()
1723
+ with open(filename, "w") as fw:
1724
+ json.dump(out, fw, indent=indent)
1725
+
1726
+ def save_as_yaml(
1727
+ self,
1728
+ filename: Path,
1729
+ artifacts_dir: Optional[Path] = None,
1730
+ image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
1731
+ default_flow_style: bool = False,
1732
+ ):
1733
+ """Save as yaml."""
1734
+ artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
1735
+
1736
+ if image_mode == ImageRefMode.REFERENCED:
1737
+ os.makedirs(artifacts_dir, exist_ok=True)
1738
+
1739
+ new_doc = self._make_copy_with_refmode(
1740
+ artifacts_dir, image_mode, reference_path=reference_path
1741
+ )
1742
+
1743
+ out = new_doc.export_to_dict()
1744
+ with open(filename, "w") as fw:
1745
+ yaml.dump(out, fw, default_flow_style=default_flow_style)
1746
+
1747
+ def export_to_dict(
1748
+ self,
1749
+ mode: str = "json",
1750
+ by_alias: bool = True,
1751
+ exclude_none: bool = True,
1752
+ ) -> Dict:
1753
+ """Export to dict."""
1754
+ out = self.model_dump(mode=mode, by_alias=by_alias, exclude_none=exclude_none)
1755
+
1756
+ return out
1757
+
1758
+ def save_as_markdown(
1759
+ self,
1760
+ filename: Path,
1761
+ artifacts_dir: Optional[Path] = None,
1762
+ delim: str = "\n",
1763
+ from_element: int = 0,
1764
+ to_element: int = sys.maxsize,
1765
+ labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
1766
+ strict_text: bool = False,
1767
+ image_placeholder: str = "<!-- image -->",
1768
+ image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
1769
+ indent: int = 4,
1770
+ text_width: int = -1,
1771
+ page_no: Optional[int] = None,
1772
+ ):
1773
+ """Save to markdown."""
1774
+ artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
1775
+
1776
+ if image_mode == ImageRefMode.REFERENCED:
1777
+ os.makedirs(artifacts_dir, exist_ok=True)
1778
+
1779
+ new_doc = self._make_copy_with_refmode(
1780
+ artifacts_dir, image_mode, reference_path=reference_path
1781
+ )
1782
+
1783
+ md_out = new_doc.export_to_markdown(
1784
+ delim=delim,
1785
+ from_element=from_element,
1786
+ to_element=to_element,
1787
+ labels=labels,
1788
+ strict_text=strict_text,
1789
+ image_placeholder=image_placeholder,
1790
+ image_mode=image_mode,
1791
+ indent=indent,
1792
+ text_width=text_width,
1793
+ page_no=page_no,
1794
+ )
1795
+
1796
+ with open(filename, "w") as fw:
1797
+ fw.write(md_out)
1311
1798
 
1312
1799
  def export_to_markdown( # noqa: C901
1313
1800
  self,
@@ -1461,22 +1948,13 @@ class DoclingDocument(BaseModel):
1461
1948
  in_list = False
1462
1949
  mdtexts.append(item.caption_text(self))
1463
1950
 
1464
- if image_mode == ImageRefMode.PLACEHOLDER:
1465
- mdtexts.append("\n" + image_placeholder + "\n")
1466
- elif image_mode == ImageRefMode.EMBEDDED and isinstance(
1467
- item.image, ImageRef
1468
- ):
1469
- text = f"![Local Image]({item.image.uri})\n"
1470
- mdtexts.append(text)
1471
- elif image_mode == ImageRefMode.EMBEDDED and not isinstance(
1472
- item.image, ImageRef
1473
- ):
1474
- text = (
1475
- "<!-- 🖼️❌ Image not available. "
1476
- "Please use `PdfPipelineOptions(generate_picture_images=True)`"
1477
- " --> "
1478
- )
1479
- mdtexts.append(text)
1951
+ line = item.export_to_markdown(
1952
+ doc=self,
1953
+ image_placeholder=image_placeholder,
1954
+ image_mode=image_mode,
1955
+ )
1956
+
1957
+ mdtexts.append(line)
1480
1958
 
1481
1959
  elif isinstance(item, DocItem) and item.label in labels:
1482
1960
  in_list = False
@@ -1518,11 +1996,288 @@ class DoclingDocument(BaseModel):
1518
1996
  image_placeholder="",
1519
1997
  )
1520
1998
 
1521
- def export_to_document_tokens(
1999
+ def save_as_html(
2000
+ self,
2001
+ filename: Path,
2002
+ artifacts_dir: Optional[Path] = None,
2003
+ from_element: int = 0,
2004
+ to_element: int = sys.maxsize,
2005
+ labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
2006
+ image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
2007
+ page_no: Optional[int] = None,
2008
+ html_lang: str = "en",
2009
+ html_head: str = _HTML_DEFAULT_HEAD,
2010
+ ):
2011
+ """Save to HTML."""
2012
+ artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
2013
+
2014
+ if image_mode == ImageRefMode.REFERENCED:
2015
+ os.makedirs(artifacts_dir, exist_ok=True)
2016
+
2017
+ new_doc = self._make_copy_with_refmode(
2018
+ artifacts_dir, image_mode, reference_path=reference_path
2019
+ )
2020
+
2021
+ html_out = new_doc.export_to_html(
2022
+ from_element=from_element,
2023
+ to_element=to_element,
2024
+ labels=labels,
2025
+ image_mode=image_mode,
2026
+ page_no=page_no,
2027
+ html_lang=html_lang,
2028
+ html_head=html_head,
2029
+ )
2030
+
2031
+ with open(filename, "w") as fw:
2032
+ fw.write(html_out)
2033
+
2034
+ def _get_output_paths(
2035
+ self, filename: Path, artifacts_dir: Optional[Path] = None
2036
+ ) -> Tuple[Path, Optional[Path]]:
2037
+ if artifacts_dir is None:
2038
+ # Remove the extension and add '_pictures'
2039
+ artifacts_dir = filename.with_suffix("")
2040
+ artifacts_dir = artifacts_dir.with_name(artifacts_dir.stem + "_artifacts")
2041
+ if artifacts_dir.is_absolute():
2042
+ reference_path = None
2043
+ else:
2044
+ reference_path = filename.parent
2045
+ return artifacts_dir, reference_path
2046
+
2047
+ def _make_copy_with_refmode(
2048
+ self,
2049
+ artifacts_dir: Path,
2050
+ image_mode: ImageRefMode,
2051
+ reference_path: Optional[Path] = None,
2052
+ ):
2053
+ new_doc = None
2054
+ if image_mode == ImageRefMode.PLACEHOLDER:
2055
+ new_doc = self
2056
+ elif image_mode == ImageRefMode.REFERENCED:
2057
+ new_doc = self._with_pictures_refs(
2058
+ image_dir=artifacts_dir, reference_path=reference_path
2059
+ )
2060
+ elif image_mode == ImageRefMode.EMBEDDED:
2061
+ new_doc = self._with_embedded_pictures()
2062
+ else:
2063
+ raise ValueError("Unsupported ImageRefMode")
2064
+ return new_doc
2065
+
2066
+ def export_to_html( # noqa: C901
2067
+ self,
2068
+ from_element: int = 0,
2069
+ to_element: int = sys.maxsize,
2070
+ labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
2071
+ image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
2072
+ page_no: Optional[int] = None,
2073
+ html_lang: str = "en",
2074
+ html_head: str = _HTML_DEFAULT_HEAD,
2075
+ ) -> str:
2076
+ r"""Serialize to HTML."""
2077
+
2078
+ def close_lists(
2079
+ curr_level: int,
2080
+ prev_level: int,
2081
+ in_ordered_list: List[bool],
2082
+ html_texts: list[str],
2083
+ ):
2084
+
2085
+ if len(in_ordered_list) == 0:
2086
+ return (in_ordered_list, html_texts)
2087
+
2088
+ while curr_level < prev_level and len(in_ordered_list) > 0:
2089
+ if in_ordered_list[-1]:
2090
+ html_texts.append("</ol>")
2091
+ else:
2092
+ html_texts.append("</ul>")
2093
+
2094
+ prev_level -= 1
2095
+ in_ordered_list.pop() # = in_ordered_list[:-1]
2096
+
2097
+ return (in_ordered_list, html_texts)
2098
+
2099
+ head_lines = ["<!DOCTYPE html>", f'<html lang="{html_lang}">', html_head]
2100
+ html_texts: list[str] = []
2101
+
2102
+ prev_level = 0 # Track the previous item's level
2103
+
2104
+ in_ordered_list: List[bool] = [] # False
2105
+
2106
+ for ix, (item, curr_level) in enumerate(
2107
+ self.iterate_items(self.body, with_groups=True, page_no=page_no)
2108
+ ):
2109
+ # If we've moved to a lower level, we're exiting one or more groups
2110
+ if curr_level < prev_level and len(in_ordered_list) > 0:
2111
+ # Calculate how many levels we've exited
2112
+ # level_difference = previous_level - level
2113
+ # Decrement list_nesting_level for each list group we've exited
2114
+ # list_nesting_level = max(0, list_nesting_level - level_difference)
2115
+
2116
+ in_ordered_list, html_texts = close_lists(
2117
+ curr_level=curr_level,
2118
+ prev_level=prev_level,
2119
+ in_ordered_list=in_ordered_list,
2120
+ html_texts=html_texts,
2121
+ )
2122
+
2123
+ prev_level = curr_level # Update previous_level for next iteration
2124
+
2125
+ if ix < from_element or to_element <= ix:
2126
+ continue # skip as many items as you want
2127
+
2128
+ if (isinstance(item, DocItem)) and (item.label not in labels):
2129
+ continue # skip any label that is not whitelisted
2130
+
2131
+ if isinstance(item, GroupItem) and item.label in [
2132
+ GroupLabel.ORDERED_LIST,
2133
+ ]:
2134
+
2135
+ text = "<ol>"
2136
+ html_texts.append(text.strip())
2137
+
2138
+ # Increment list nesting level when entering a new list
2139
+ in_ordered_list.append(True)
2140
+
2141
+ elif isinstance(item, GroupItem) and item.label in [
2142
+ GroupLabel.LIST,
2143
+ ]:
2144
+
2145
+ text = "<ul>"
2146
+ html_texts.append(text.strip())
2147
+
2148
+ # Increment list nesting level when entering a new list
2149
+ in_ordered_list.append(False)
2150
+
2151
+ elif isinstance(item, GroupItem):
2152
+ continue
2153
+
2154
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
2155
+
2156
+ text = f"<h1>{item.text}</h1>"
2157
+ html_texts.append(text.strip())
2158
+
2159
+ elif isinstance(item, SectionHeaderItem):
2160
+
2161
+ section_level: int = item.level + 1
2162
+
2163
+ text = f"<h{(section_level)}>{item.text}</h{(section_level)}>"
2164
+ html_texts.append(text.strip())
2165
+
2166
+ elif isinstance(item, TextItem) and item.label in [
2167
+ DocItemLabel.SECTION_HEADER
2168
+ ]:
2169
+
2170
+ section_level = curr_level
2171
+
2172
+ if section_level <= 1:
2173
+ section_level = 2
2174
+
2175
+ if section_level >= 6:
2176
+ section_level = 6
2177
+
2178
+ text = f"<h{section_level}>{item.text}</h{section_level}>"
2179
+ html_texts.append(text.strip())
2180
+
2181
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
2182
+
2183
+ text = f"<pre>{item.text}</pre>"
2184
+ html_texts.append(text)
2185
+
2186
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
2187
+ # captions are printed in picture and table ... skipping for now
2188
+ continue
2189
+
2190
+ elif isinstance(item, ListItem):
2191
+
2192
+ text = f"<li>{item.text}</li>"
2193
+ html_texts.append(text)
2194
+
2195
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
2196
+
2197
+ text = f"<li>{item.text}</li>"
2198
+ html_texts.append(text)
2199
+
2200
+ elif isinstance(item, TextItem) and item.label in labels:
2201
+
2202
+ text = f"<p>{item.text}</p>"
2203
+ html_texts.append(text.strip())
2204
+
2205
+ elif isinstance(item, TableItem):
2206
+
2207
+ text = item.export_to_html(doc=self, add_caption=True)
2208
+ html_texts.append(text)
2209
+
2210
+ elif isinstance(item, PictureItem):
2211
+
2212
+ html_texts.append(
2213
+ item.export_to_html(
2214
+ doc=self, add_caption=True, image_mode=image_mode
2215
+ )
2216
+ )
2217
+
2218
+ elif isinstance(item, DocItem) and item.label in labels:
2219
+ continue
2220
+
2221
+ html_texts.append("</html>")
2222
+
2223
+ lines = []
2224
+ lines.extend(head_lines)
2225
+ for i, line in enumerate(html_texts):
2226
+ lines.append(line.replace("\n", "<br>"))
2227
+
2228
+ delim = "\n"
2229
+ html_text = (delim.join(lines)).strip()
2230
+
2231
+ return html_text
2232
+
2233
+ def save_as_document_tokens(
1522
2234
  self,
2235
+ filename: Path,
1523
2236
  delim: str = "\n\n",
1524
2237
  from_element: int = 0,
1525
- to_element: Optional[int] = None,
2238
+ to_element: int = sys.maxsize,
2239
+ labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
2240
+ xsize: int = 100,
2241
+ ysize: int = 100,
2242
+ add_location: bool = True,
2243
+ add_content: bool = True,
2244
+ add_page_index: bool = True,
2245
+ # table specific flags
2246
+ add_table_cell_location: bool = False,
2247
+ add_table_cell_label: bool = True,
2248
+ add_table_cell_text: bool = True,
2249
+ # specifics
2250
+ page_no: Optional[int] = None,
2251
+ with_groups: bool = True,
2252
+ ):
2253
+ r"""Save the document content to a DocumentToken format."""
2254
+ out = self.export_to_document_tokens(
2255
+ delim=delim,
2256
+ from_element=from_element,
2257
+ to_element=to_element,
2258
+ labels=labels,
2259
+ xsize=xsize,
2260
+ ysize=ysize,
2261
+ add_location=add_location,
2262
+ add_content=add_content,
2263
+ add_page_index=add_page_index,
2264
+ # table specific flags
2265
+ add_table_cell_location=add_table_cell_location,
2266
+ add_table_cell_label=add_table_cell_label,
2267
+ add_table_cell_text=add_table_cell_text,
2268
+ # specifics
2269
+ page_no=page_no,
2270
+ with_groups=with_groups,
2271
+ )
2272
+
2273
+ with open(filename, "w") as fw:
2274
+ fw.write(out)
2275
+
2276
+ def export_to_document_tokens(
2277
+ self,
2278
+ delim: str = "\n",
2279
+ from_element: int = 0,
2280
+ to_element: int = sys.maxsize,
1526
2281
  labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
1527
2282
  xsize: int = 100,
1528
2283
  ysize: int = 100,
@@ -1533,8 +2288,12 @@ class DoclingDocument(BaseModel):
1533
2288
  add_table_cell_location: bool = False,
1534
2289
  add_table_cell_label: bool = True,
1535
2290
  add_table_cell_text: bool = True,
2291
+ # specifics
2292
+ page_no: Optional[int] = None,
2293
+ with_groups: bool = True,
2294
+ newline: bool = True,
1536
2295
  ) -> str:
1537
- r"""Exports the document content to an DocumentToken format.
2296
+ r"""Exports the document content to a DocumentToken format.
1538
2297
 
1539
2298
  Operates on a slice of the document's body as defined through arguments
1540
2299
  from_element and to_element; defaulting to the whole main_text.
@@ -1554,44 +2313,102 @@ class DoclingDocument(BaseModel):
1554
2313
  :returns: The content of the document formatted as a DocTags string.
1555
2314
  :rtype: str
1556
2315
  """
1557
- new_line = ""
1558
- if delim:
1559
- new_line = "\n"
1560
2316
 
1561
- doctags = f"{DocumentToken.BEG_DOCUMENT.value}{new_line}"
2317
def close_lists(
    curr_level: int,
    prev_level: int,
    in_ordered_list: List[bool],
    result: str,
    delim: str,
):
    """Emit closing tags for the list groups left between two levels.

    One entry is popped from ``in_ordered_list`` per level between
    ``prev_level`` and ``curr_level`` (or until the stack is empty); for
    each popped entry the matching ``</ordered_list>`` (True) or
    ``</unordered_list>`` (False) tag followed by ``delim`` is appended
    to ``result``.

    The stack is modified in place; the same list object is returned
    together with the extended result string.
    """
    remaining = prev_level - curr_level

    # An empty stack or a non-positive level drop leaves everything as is.
    while remaining > 0 and in_ordered_list:
        was_ordered = in_ordered_list.pop()
        tag = "ordered_list" if was_ordered else "unordered_list"
        result += f"</{tag}>{delim}"
        remaining -= 1

    return (in_ordered_list, result)
2338
+
2339
+ if newline:
2340
+ delim = "\n"
2341
+ else:
2342
+ delim = ""
1564
2343
 
1565
- skip_count = 0
1566
- for ix, (item, level) in enumerate(self.iterate_items(self.body)):
1567
- if skip_count < from_element:
1568
- skip_count += 1
2344
+ prev_level = 0 # Track the previous item's level
2345
+
2346
+ in_ordered_list: List[bool] = [] # False
2347
+
2348
+ result = f"{DocumentToken.BEG_DOCUMENT.value}{delim}"
2349
+
2350
+ for ix, (item, curr_level) in enumerate(
2351
+ self.iterate_items(self.body, with_groups=True)
2352
+ ):
2353
+
2354
+ # If we've moved to a lower level, we're exiting one or more groups
2355
+ if curr_level < prev_level and len(in_ordered_list) > 0:
2356
+ # Calculate how many levels we've exited
2357
+ # level_difference = previous_level - level
2358
+ # Decrement list_nesting_level for each list group we've exited
2359
+ # list_nesting_level = max(0, list_nesting_level - level_difference)
2360
+
2361
+ in_ordered_list, result = close_lists(
2362
+ curr_level=curr_level,
2363
+ prev_level=prev_level,
2364
+ in_ordered_list=in_ordered_list,
2365
+ result=result,
2366
+ delim=delim,
2367
+ )
2368
+
2369
+ prev_level = curr_level # Update previous_level for next iteration
2370
+
2371
+ if ix < from_element or to_element <= ix:
1569
2372
  continue # skip as many items as you want
1570
2373
 
1571
- if to_element and ix >= to_element:
1572
- break
2374
+ if (isinstance(item, DocItem)) and (item.label not in labels):
2375
+ continue # skip any label that is not whitelisted
1573
2376
 
1574
- if not isinstance(item, DocItem):
1575
- continue
2377
+ if isinstance(item, GroupItem) and item.label in [
2378
+ GroupLabel.ORDERED_LIST,
2379
+ ]:
1576
2380
 
1577
- prov = item.prov
2381
+ result += f"<ordered_list>{delim}"
2382
+ in_ordered_list.append(True)
1578
2383
 
1579
- page_i = -1
2384
+ elif isinstance(item, GroupItem) and item.label in [
2385
+ GroupLabel.LIST,
2386
+ ]:
1580
2387
 
1581
- if add_location and len(self.pages) and len(prov) > 0:
2388
+ result += f"<unordered_list>{delim}"
2389
+ in_ordered_list.append(False)
1582
2390
 
1583
- page_i = prov[0].page_no
1584
- page_dim = self.pages[page_i].size
2391
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
2392
+ # captions are printed in picture and table ... skipping for now
2393
+ continue
1585
2394
 
1586
- float(page_dim.width)
1587
- float(page_dim.height)
2395
+ elif isinstance(item, SectionHeaderItem):
2396
+
2397
+ result += item.export_to_document_tokens(
2398
+ doc=self,
2399
+ new_line=delim,
2400
+ xsize=xsize,
2401
+ ysize=ysize,
2402
+ add_location=add_location,
2403
+ add_content=add_content,
2404
+ add_page_index=add_page_index,
2405
+ )
1588
2406
 
1589
- item_type = item.label
1590
- if isinstance(item, TextItem) and (item_type in labels):
2407
+ elif isinstance(item, TextItem) and (item.label in labels):
1591
2408
 
1592
- doctags += item.export_to_document_tokens(
2409
+ result += item.export_to_document_tokens(
1593
2410
  doc=self,
1594
- new_line=new_line,
2411
+ new_line=delim,
1595
2412
  xsize=xsize,
1596
2413
  ysize=ysize,
1597
2414
  add_location=add_location,
@@ -1599,11 +2416,11 @@ class DoclingDocument(BaseModel):
1599
2416
  add_page_index=add_page_index,
1600
2417
  )
1601
2418
 
1602
- elif isinstance(item, TableItem) and (item_type in labels):
2419
+ elif isinstance(item, TableItem) and (item.label in labels):
1603
2420
 
1604
- doctags += item.export_to_document_tokens(
2421
+ result += item.export_to_document_tokens(
1605
2422
  doc=self,
1606
- new_line=new_line,
2423
+ new_line=delim,
1607
2424
  xsize=xsize,
1608
2425
  ysize=ysize,
1609
2426
  add_caption=True,
@@ -1615,11 +2432,11 @@ class DoclingDocument(BaseModel):
1615
2432
  add_page_index=add_page_index,
1616
2433
  )
1617
2434
 
1618
- elif isinstance(item, PictureItem) and (item_type in labels):
2435
+ elif isinstance(item, PictureItem) and (item.label in labels):
1619
2436
 
1620
- doctags += item.export_to_document_tokens(
2437
+ result += item.export_to_document_tokens(
1621
2438
  doc=self,
1622
- new_line=new_line,
2439
+ new_line=delim,
1623
2440
  xsize=xsize,
1624
2441
  ysize=ysize,
1625
2442
  add_caption=True,
@@ -1628,9 +2445,9 @@ class DoclingDocument(BaseModel):
1628
2445
  add_page_index=add_page_index,
1629
2446
  )
1630
2447
 
1631
- doctags += DocumentToken.END_DOCUMENT.value
2448
+ result += DocumentToken.END_DOCUMENT.value
1632
2449
 
1633
- return doctags
2450
+ return result
1634
2451
 
1635
2452
  def _export_to_indented_text(
1636
2453
  self, indent=" ", max_text_len: int = -1, explicit_tables: bool = False
@@ -65,3 +65,43 @@ def resolve_file_source(
65
65
  except ValidationError:
66
66
  raise ValueError(f"Unexpected source type encountered: {type(source)}")
67
67
  return local_path
68
+
69
+
70
def relative_path(src: Path, target: Path) -> Path:
    """Compute the relative path that leads from `src` to `target`.

    Both inputs are first converted with ``Path(...).resolve()``, so
    relative inputs are interpreted against the current working directory.
    `src` is treated as a directory: every component of `src` below the
    common ancestor contributes one ``..`` segment.

    Args:
        src (str | Path): The location the result is relative to.
        target (str | Path): The location the result points at.

    Returns:
        Path: ``..`` segments up to the common ancestor followed by the
        remaining components of `target`.

    Raises:
        ValueError: If `src` or `target` is still not absolute after
            being resolved.
    """
    src_abs = Path(src).resolve()
    target_abs = Path(target).resolve()

    # Source is validated before target, one message per offending path.
    for role, candidate in (("source", src_abs), ("target", target_abs)):
        if not candidate.is_absolute():
            raise ValueError(f"The {role} path must be absolute: {candidate}")

    src_parts = src_abs.parts
    target_parts = target_abs.parts

    # Length of the leading run of components shared by both paths.
    shared = 0
    limit = min(len(src_parts), len(target_parts))
    while shared < limit and src_parts[shared] == target_parts[shared]:
        shared += 1

    # Climb from src to the common ancestor, then descend to target.
    climb = [".."] * (len(src_parts) - shared)
    descend = target_parts[shared:]
    return Path(*climb, *descend)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.4.0
3
+ Version: 2.5.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -29,7 +29,8 @@ Requires-Dist: jsonref (>=1.1.0,<2.0.0)
29
29
  Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
30
30
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
31
31
  Requires-Dist: pillow (>=10.3.0,<11.0.0)
32
- Requires-Dist: pydantic (>=2.6.0,<3.0.0)
32
+ Requires-Dist: pydantic (>=2.6.0,<2.10)
33
+ Requires-Dist: pyyaml (>=5.1,<7.0.0)
33
34
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
34
35
  Project-URL: Repository, https://github.com/DS4SD/docling-core
35
36
  Description-Content-Type: text/markdown
@@ -20,8 +20,8 @@ docling_core/transforms/chunker/hierarchical_chunker.py,sha256=V4FiOYqL0GgBqVB7x
20
20
  docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
21
21
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
22
22
  docling_core/types/doc/__init__.py,sha256=bEL4zKVOG7Wxm6xQrgF58mu-Teds9aSavuEAKVNhrTU,639
23
- docling_core/types/doc/base.py,sha256=zvx631U_yQCcJam83hNdDanXEYnO3eN-CCw9vDr6S-I,4442
24
- docling_core/types/doc/document.py,sha256=6KeHY4yl4Ry5nT6wacb8ujJ5LnyEZohXG5MAGhoPWGY,57771
23
+ docling_core/types/doc/base.py,sha256=_ttU8QI8wXDTQRUnN5n7L6D9wYFVLSAibxlFoMbgAsk,4557
24
+ docling_core/types/doc/document.py,sha256=05q8KZ64TVpxJoegPy7MOlvI0fmqUtKRKZMGsdvUz9c,85711
25
25
  docling_core/types/doc/labels.py,sha256=A8vWP82VAeXO1rlCO0oDKo_Hb8uDeQe0myOTY3P03hk,1596
26
26
  docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
27
27
  docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
@@ -44,13 +44,13 @@ docling_core/types/rec/statement.py,sha256=YwcV4CbVaAbzNwh14yJ_6Py3Ww0XnUJrEEUiK
44
44
  docling_core/types/rec/subject.py,sha256=PRCERGTMs4YhR3_Ne6jogkm41zYg8uUWb1yFpM7atm4,2572
45
45
  docling_core/utils/__init__.py,sha256=VauNNpWRHG0_ISKrsy5-gTxicrdQZSau6qMfuMl3iqk,120
46
46
  docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,874
47
- docling_core/utils/file.py,sha256=rZ3kaIpX2ZGxtaSXtqjcrivtXvsbeUolLXT-nntQ5yE,2388
47
+ docling_core/utils/file.py,sha256=ug4-z0KuthkEb_d5YDRPbY79PWfNSj9GYsi16xF2sDA,3699
48
48
  docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
49
49
  docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
50
50
  docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
51
51
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
52
- docling_core-2.4.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
53
- docling_core-2.4.0.dist-info/METADATA,sha256=fXFVK6Ey5DC15uSYgMixUmGxH6hEM-Kx06tK7jvW2IA,5432
54
- docling_core-2.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
55
- docling_core-2.4.0.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
56
- docling_core-2.4.0.dist-info/RECORD,,
52
+ docling_core-2.5.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
53
+ docling_core-2.5.0.dist-info/METADATA,sha256=u4KdNbLkumFHT5HFI7XZo9AUeYryHHkH8iYpDDInA7Q,5468
54
+ docling_core-2.5.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
55
+ docling_core-2.5.0.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
56
+ docling_core-2.5.0.dist-info/RECORD,,