docling-core 2.4.1__py3-none-any.whl → 2.5.1__py3-none-any.whl

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -10,8 +10,9 @@ from pydantic import BaseModel
10
10
  class ImageRefMode(str, Enum):
11
11
  """ImageRefMode."""
12
12
 
13
- PLACEHOLDER = "placeholder"
14
- EMBEDDED = "embedded"
13
+ PLACEHOLDER = "placeholder" # just a place-holder
14
+ EMBEDDED = "embedded" # embed the image as a base64
15
+ REFERENCED = "referenced" # reference the image via uri
15
16
 
16
17
 
17
18
  class CoordOrigin(str, Enum):
@@ -1,15 +1,23 @@
1
1
  """Models for the Docling Document data type."""
2
2
 
3
3
  import base64
4
+ import copy
5
+ import hashlib
6
+ import json
4
7
  import mimetypes
8
+ import os
5
9
  import re
6
10
  import sys
7
11
  import textwrap
8
12
  import typing
13
+ import warnings
9
14
  from io import BytesIO
15
+ from pathlib import Path
10
16
  from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
17
+ from urllib.parse import unquote
11
18
 
12
19
  import pandas as pd
20
+ import yaml
13
21
  from PIL import Image as PILImage
14
22
  from pydantic import (
15
23
  AnyUrl,
@@ -30,6 +38,7 @@ from docling_core.types.doc import BoundingBox, Size
30
38
  from docling_core.types.doc.base import ImageRefMode
31
39
  from docling_core.types.doc.labels import DocItemLabel, GroupLabel
32
40
  from docling_core.types.legacy_doc.tokens import DocumentToken
41
+ from docling_core.utils.file import relative_path
33
42
 
34
43
  Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
35
44
  LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
@@ -436,21 +445,25 @@ class ImageRef(BaseModel):
436
445
  mimetype: str
437
446
  dpi: int
438
447
  size: Size
439
- uri: AnyUrl
448
+ uri: Union[AnyUrl, Path]
440
449
  _pil: Optional[PILImage.Image] = None
441
450
 
442
451
  @property
443
- def pil_image(self) -> PILImage.Image:
452
+ def pil_image(self) -> Optional[PILImage.Image]:
444
453
  """Return the PIL Image."""
445
454
  if self._pil is not None:
446
455
  return self._pil
447
456
 
448
- if str(self.uri).startswith("data:"):
449
- encoded_img = str(self.uri).split(",")[1]
450
- decoded_img = base64.b64decode(encoded_img)
451
- self._pil = PILImage.open(BytesIO(decoded_img))
452
- else:
453
- self._pil = PILImage.open(str(self.uri))
457
+ if isinstance(self.uri, AnyUrl):
458
+ if self.uri.scheme == "data":
459
+ encoded_img = str(self.uri).split(",")[1]
460
+ decoded_img = base64.b64decode(encoded_img)
461
+ self._pil = PILImage.open(BytesIO(decoded_img))
462
+ elif self.uri.scheme == "file":
463
+ self._pil = PILImage.open(unquote(str(self.uri.path)))
464
+ # else: Handle http request or other protocols...
465
+ elif isinstance(self.uri, Path):
466
+ self._pil = PILImage.open(self.uri)
454
467
 
455
468
  return self._pil
456
469
 
@@ -566,6 +579,8 @@ class DocItem(
566
579
  return None
567
580
 
568
581
  page_image = page.image.pil_image
582
+ if not page_image:
583
+ return None
569
584
  crop_bbox = (
570
585
  self.prov[0]
571
586
  .bbox.to_top_left_origin(page_height=page.size.height)
@@ -631,6 +646,50 @@ class SectionHeaderItem(TextItem):
631
646
  label: typing.Literal[DocItemLabel.SECTION_HEADER] = DocItemLabel.SECTION_HEADER
632
647
  level: LevelNumber
633
648
 
649
+ def export_to_document_tokens(
650
+ self,
651
+ doc: "DoclingDocument",
652
+ new_line: str = "\n",
653
+ xsize: int = 100,
654
+ ysize: int = 100,
655
+ add_location: bool = True,
656
+ add_content: bool = True,
657
+ add_page_index: bool = True,
658
+ ):
659
+ r"""Export text element to document tokens format.
660
+
661
+ :param doc: "DoclingDocument":
662
+ :param new_line: str: (Default value = "\n")
663
+ :param xsize: int: (Default value = 100)
664
+ :param ysize: int: (Default value = 100)
665
+ :param add_location: bool: (Default value = True)
666
+ :param add_content: bool: (Default value = True)
667
+ :param add_page_index: bool: (Default value = True)
668
+
669
+ """
670
+ body = f"<{self.label.value}_level_{self.level}>"
671
+
672
+ # TODO: This must be done through an explicit mapping.
673
+ # assert DocumentToken.is_known_token(
674
+ # body
675
+ # ), f"failed DocumentToken.is_known_token({body})"
676
+
677
+ if add_location:
678
+ body += self.get_location_tokens(
679
+ doc=doc,
680
+ new_line="",
681
+ xsize=xsize,
682
+ ysize=ysize,
683
+ add_page_index=add_page_index,
684
+ )
685
+
686
+ if add_content and self.text is not None:
687
+ body += self.text.strip()
688
+
689
+ body += f"</{self.label.value}_level_{self.level}>{new_line}"
690
+
691
+ return body
692
+
634
693
 
635
694
  class ListItem(TextItem):
636
695
  """SectionItem."""
@@ -677,6 +736,152 @@ class PictureItem(FloatingItem):
677
736
 
678
737
  annotations: List[PictureDataType] = []
679
738
 
739
+ # Convert the image to Base64
740
+ def _image_to_base64(self, pil_image, format="PNG"):
741
+ """Base64 representation of the image."""
742
+ buffered = BytesIO()
743
+ pil_image.save(buffered, format=format) # Save the image to the byte stream
744
+ img_bytes = buffered.getvalue() # Get the byte data
745
+ img_base64 = base64.b64encode(img_bytes).decode(
746
+ "utf-8"
747
+ ) # Encode to Base64 and decode to string
748
+ return img_base64
749
+
750
+ def _image_to_hexhash(self) -> Optional[str]:
751
+ """Hexash from the image."""
752
+ if self.image is not None and self.image._pil is not None:
753
+ # Convert the image to raw bytes
754
+ image_bytes = self.image._pil.tobytes()
755
+
756
+ # Create a hash object (e.g., SHA-256)
757
+ hasher = hashlib.sha256()
758
+
759
+ # Feed the image bytes into the hash object
760
+ hasher.update(image_bytes)
761
+
762
+ # Get the hexadecimal representation of the hash
763
+ return hasher.hexdigest()
764
+
765
+ return None
766
+
767
+ def export_to_markdown(
768
+ self,
769
+ doc: "DoclingDocument",
770
+ add_caption: bool = True,
771
+ image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
772
+ image_placeholder: str = "<!-- image -->",
773
+ ) -> str:
774
+ """Export picture to Markdown format."""
775
+ default_response = "\n" + image_placeholder + "\n"
776
+ error_response = (
777
+ "\n<!-- 🖼️❌ Image not available. "
778
+ "Please use `PdfPipelineOptions(generate_picture_images=True)`"
779
+ " --> \n"
780
+ )
781
+
782
+ if image_mode == ImageRefMode.PLACEHOLDER:
783
+ return default_response
784
+
785
+ elif image_mode == ImageRefMode.EMBEDDED:
786
+
787
+ # short-cut: we already have the image in base64
788
+ if (
789
+ isinstance(self.image, ImageRef)
790
+ and isinstance(self.image.uri, AnyUrl)
791
+ and self.image.uri.scheme == "data"
792
+ ):
793
+ text = f"\n![Image]({self.image.uri})\n"
794
+ return text
795
+
796
+ # get the self.image._pil or crop it out of the page-image
797
+ img = self.get_image(doc)
798
+
799
+ if img is not None:
800
+ imgb64 = self._image_to_base64(img)
801
+ text = f"\n![Image]({imgb64})\n"
802
+
803
+ return text
804
+ else:
805
+ return error_response
806
+
807
+ elif image_mode == ImageRefMode.REFERENCED:
808
+ if not isinstance(self.image, ImageRef) or (
809
+ isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "data"
810
+ ):
811
+ return default_response
812
+
813
+ if (
814
+ isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "file"
815
+ ) or isinstance(self.image.uri, Path):
816
+ text = f"\n![Image]({str(self.image.uri)})\n"
817
+ return text
818
+
819
+ else:
820
+ return default_response
821
+
822
+ else:
823
+ return default_response
824
+
825
+ def export_to_html(
826
+ self,
827
+ doc: "DoclingDocument",
828
+ add_caption: bool = True,
829
+ image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
830
+ ) -> str:
831
+ """Export picture to HTML format."""
832
+ text = ""
833
+ if add_caption and len(self.captions):
834
+ text = self.caption_text(doc)
835
+
836
+ caption_text = ""
837
+ if len(text) > 0:
838
+ caption_text = f"<figcaption>{text}</figcaption>"
839
+
840
+ default_response = f"<figure>{caption_text}</figure>"
841
+
842
+ if image_mode == ImageRefMode.PLACEHOLDER:
843
+ return default_response
844
+
845
+ elif image_mode == ImageRefMode.EMBEDDED:
846
+ # short-cut: we already have the image in base64
847
+ if (
848
+ isinstance(self.image, ImageRef)
849
+ and isinstance(self.image.uri, AnyUrl)
850
+ and self.image.uri.scheme == "data"
851
+ ):
852
+ img_text = f'<img src="{self.image.uri}">'
853
+ return f"<figure>{caption_text}{img_text}</figure>"
854
+
855
+ # get the self.image._pil or crop it out of the page-image
856
+ img = self.get_image(doc)
857
+
858
+ if img is not None:
859
+ imgb64 = self._image_to_base64(img)
860
+ img_text = f'<img src="data:image/png;base64,{imgb64}">'
861
+
862
+ return f"<figure>{caption_text}{img_text}</figure>"
863
+ else:
864
+ return default_response
865
+
866
+ elif image_mode == ImageRefMode.REFERENCED:
867
+
868
+ if not isinstance(self.image, ImageRef) or (
869
+ isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "data"
870
+ ):
871
+ return default_response
872
+
873
+ if (
874
+ isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "file"
875
+ ) or isinstance(self.image.uri, Path):
876
+ img_text = f'<img src="{str(self.image.uri)}">'
877
+ return f"<figure>{caption_text}{img_text}</figure>"
878
+
879
+ else:
880
+ return default_response
881
+
882
+ else:
883
+ return default_response
884
+
680
885
  def export_to_document_tokens(
681
886
  self,
682
887
  doc: "DoclingDocument",
@@ -804,14 +1009,30 @@ class TableItem(FloatingItem):
804
1009
  )
805
1010
  return md_table
806
1011
 
807
- def export_to_html(self) -> str:
1012
+ def export_to_html(
1013
+ self, doc: Optional["DoclingDocument"] = None, add_caption: bool = True
1014
+ ) -> str:
808
1015
  """Export the table as html."""
1016
+ if doc is None:
1017
+ warnings.warn(
1018
+ "The `doc` argument will be mandatory in a future version. "
1019
+ "It must be provided to include a caption.",
1020
+ DeprecationWarning,
1021
+ )
1022
+
809
1023
  body = ""
810
1024
  nrows = self.data.num_rows
811
1025
  ncols = self.data.num_cols
812
1026
 
813
- if not len(self.data.table_cells):
1027
+ text = ""
1028
+ if doc is not None and add_caption and len(self.captions):
1029
+ text = self.caption_text(doc)
1030
+
1031
+ if len(self.data.table_cells) == 0:
814
1032
  return ""
1033
+
1034
+ body = ""
1035
+
815
1036
  for i in range(nrows):
816
1037
  body += "<tr>"
817
1038
  for j in range(ncols):
@@ -844,7 +1065,15 @@ class TableItem(FloatingItem):
844
1065
 
845
1066
  body += f"<{opening_tag}>{content}</{celltag}>"
846
1067
  body += "</tr>"
847
- body = f"<table>{body}</table>"
1068
+
1069
+ if len(text) > 0 and len(body) > 0:
1070
+ body = f"<table><caption>{text}</caption><tbody>{body}</tbody></table>"
1071
+ elif len(text) == 0 and len(body) > 0:
1072
+ body = f"<table><tbody>{body}</tbody></table>"
1073
+ elif len(text) > 0 and len(body) == 0:
1074
+ body = f"<table><caption>{text}</caption></table>"
1075
+ else:
1076
+ body = "<table></table>"
848
1077
 
849
1078
  return body
850
1079
 
@@ -981,6 +1210,23 @@ class PageItem(BaseModel):
981
1210
  class DoclingDocument(BaseModel):
982
1211
  """DoclingDocument."""
983
1212
 
1213
+ _HTML_DEFAULT_HEAD: str = r"""<head>
1214
+ <meta charset="UTF-8">
1215
+ <style>
1216
+ table {
1217
+ border-collapse: separate;
1218
+ /* Maintain separate borders */
1219
+ border-spacing: 5px; /*
1220
+ Space between cells */
1221
+ width: 50%;
1222
+ }
1223
+ th, td {
1224
+ border: 1px solid black;
1225
+ /* Add lines etween cells */
1226
+ padding: 8px; }
1227
+ </style>
1228
+ </head>"""
1229
+
984
1230
  schema_name: typing.Literal["DoclingDocument"] = "DoclingDocument"
985
1231
  version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
986
1232
  CURRENT_VERSION
@@ -1045,7 +1291,7 @@ class DoclingDocument(BaseModel):
1045
1291
  prov: Optional[ProvenanceItem] = None,
1046
1292
  parent: Optional[GroupItem] = None,
1047
1293
  ):
1048
- """add_paragraph.
1294
+ """add_list_item.
1049
1295
 
1050
1296
  :param label: str:
1051
1297
  :param text: str:
@@ -1088,7 +1334,7 @@ class DoclingDocument(BaseModel):
1088
1334
  prov: Optional[ProvenanceItem] = None,
1089
1335
  parent: Optional[GroupItem] = None,
1090
1336
  ):
1091
- """add_paragraph.
1337
+ """add_text.
1092
1338
 
1093
1339
  :param label: str:
1094
1340
  :param text: str:
@@ -1097,28 +1343,41 @@ class DoclingDocument(BaseModel):
1097
1343
  :param parent: Optional[GroupItem]: (Default value = None)
1098
1344
 
1099
1345
  """
1100
- if not parent:
1101
- parent = self.body
1346
+ # Catch a few cases that are in principle allowed
1347
+ # but that will create confusion down the road
1348
+ if label in [DocItemLabel.TITLE]:
1349
+ return self.add_title(text=text, orig=orig, prov=prov, parent=parent)
1102
1350
 
1103
- if not orig:
1104
- orig = text
1351
+ elif label in [DocItemLabel.LIST_ITEM]:
1352
+ return self.add_list_item(text=text, orig=orig, prov=prov, parent=parent)
1105
1353
 
1106
- text_index = len(self.texts)
1107
- cref = f"#/texts/{text_index}"
1108
- text_item = TextItem(
1109
- label=label,
1110
- text=text,
1111
- orig=orig,
1112
- self_ref=cref,
1113
- parent=parent.get_ref(),
1114
- )
1115
- if prov:
1116
- text_item.prov.append(prov)
1354
+ elif label in [DocItemLabel.SECTION_HEADER]:
1355
+ return self.add_heading(text=text, orig=orig, prov=prov, parent=parent)
1117
1356
 
1118
- self.texts.append(text_item)
1119
- parent.children.append(RefItem(cref=cref))
1357
+ else:
1120
1358
 
1121
- return text_item
1359
+ if not parent:
1360
+ parent = self.body
1361
+
1362
+ if not orig:
1363
+ orig = text
1364
+
1365
+ text_index = len(self.texts)
1366
+ cref = f"#/texts/{text_index}"
1367
+ text_item = TextItem(
1368
+ label=label,
1369
+ text=text,
1370
+ orig=orig,
1371
+ self_ref=cref,
1372
+ parent=parent.get_ref(),
1373
+ )
1374
+ if prov:
1375
+ text_item.prov.append(prov)
1376
+
1377
+ self.texts.append(text_item)
1378
+ parent.children.append(RefItem(cref=cref))
1379
+
1380
+ return text_item
1122
1381
 
1123
1382
  def add_table(
1124
1383
  self,
@@ -1170,7 +1429,6 @@ class DoclingDocument(BaseModel):
1170
1429
  :param RefItem]]: (Default value = None)
1171
1430
  :param prov: Optional[ProvenanceItem]: (Default value = None)
1172
1431
  :param parent: Optional[GroupItem]: (Default value = None)
1173
-
1174
1432
  """
1175
1433
  if not parent:
1176
1434
  parent = self.body
@@ -1195,6 +1453,43 @@ class DoclingDocument(BaseModel):
1195
1453
 
1196
1454
  return fig_item
1197
1455
 
1456
+ def add_title(
1457
+ self,
1458
+ text: str,
1459
+ orig: Optional[str] = None,
1460
+ prov: Optional[ProvenanceItem] = None,
1461
+ parent: Optional[GroupItem] = None,
1462
+ ):
1463
+ """add_title.
1464
+
1465
+ :param text: str:
1466
+ :param orig: Optional[str]: (Default value = None)
1467
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
1468
+ :param parent: Optional[GroupItem]: (Default value = None)
1469
+ """
1470
+ if not parent:
1471
+ parent = self.body
1472
+
1473
+ if not orig:
1474
+ orig = text
1475
+
1476
+ text_index = len(self.texts)
1477
+ cref = f"#/texts/{text_index}"
1478
+ text_item = TextItem(
1479
+ label=DocItemLabel.TITLE,
1480
+ text=text,
1481
+ orig=orig,
1482
+ self_ref=cref,
1483
+ parent=parent.get_ref(),
1484
+ )
1485
+ if prov:
1486
+ text_item.prov.append(prov)
1487
+
1488
+ self.texts.append(text_item)
1489
+ parent.children.append(RefItem(cref=cref))
1490
+
1491
+ return text_item
1492
+
1198
1493
  def add_heading(
1199
1494
  self,
1200
1495
  text: str,
@@ -1211,7 +1506,6 @@ class DoclingDocument(BaseModel):
1211
1506
  :param level: LevelNumber: (Default value = 1)
1212
1507
  :param prov: Optional[ProvenanceItem]: (Default value = None)
1213
1508
  :param parent: Optional[GroupItem]: (Default value = None)
1214
-
1215
1509
  """
1216
1510
  if not parent:
1217
1511
  parent = self.body
@@ -1297,17 +1591,220 @@ class DoclingDocument(BaseModel):
1297
1591
  page_no=page_no,
1298
1592
  )
1299
1593
 
1594
+ def _clear_picture_pil_cache(self):
1595
+ """Clear cache storage of all images."""
1596
+ for item, level in self.iterate_items(with_groups=False):
1597
+ if isinstance(item, PictureItem):
1598
+ if item.image is not None and item.image._pil is not None:
1599
+ item.image._pil.close()
1600
+
1601
+ def _list_images_on_disk(self) -> List[Path]:
1602
+ """List all images on disk."""
1603
+ result: List[Path] = []
1604
+
1605
+ for item, level in self.iterate_items(with_groups=False):
1606
+ if isinstance(item, PictureItem):
1607
+ if item.image is not None:
1608
+ if (
1609
+ isinstance(item.image.uri, AnyUrl)
1610
+ and item.image.uri.scheme == "file"
1611
+ and item.image.uri.path is not None
1612
+ ):
1613
+ local_path = Path(unquote(item.image.uri.path))
1614
+ result.append(local_path)
1615
+ elif isinstance(item.image.uri, Path):
1616
+ result.append(item.image.uri)
1617
+
1618
+ return result
1619
+
1620
+ def _with_embedded_pictures(self) -> "DoclingDocument":
1621
+ """Document with embedded images.
1622
+
1623
+ Creates a copy of this document where all pictures referenced
1624
+ through a file URI are turned into base64 embedded form.
1625
+ """
1626
+ result: DoclingDocument = copy.deepcopy(self)
1627
+
1628
+ for ix, (item, level) in enumerate(result.iterate_items(with_groups=True)):
1629
+ if isinstance(item, PictureItem):
1630
+
1631
+ if item.image is not None:
1632
+ if (
1633
+ isinstance(item.image.uri, AnyUrl)
1634
+ and item.image.uri.scheme == "file"
1635
+ ):
1636
+ assert isinstance(item.image.uri.path, str)
1637
+ tmp_image = PILImage.open(str(unquote(item.image.uri.path)))
1638
+ item.image = ImageRef.from_pil(tmp_image, dpi=item.image.dpi)
1639
+
1640
+ elif isinstance(item.image.uri, Path):
1641
+ tmp_image = PILImage.open(str(item.image.uri))
1642
+ item.image = ImageRef.from_pil(tmp_image, dpi=item.image.dpi)
1643
+
1644
+ return result
1645
+
1646
+ def _with_pictures_refs(
1647
+ self, image_dir: Path, reference_path: Optional[Path] = None
1648
+ ) -> "DoclingDocument":
1649
+ """Document with images as refs.
1650
+
1651
+ Creates a copy of this document where all picture data is
1652
+ saved to image_dir and referenced through file URIs.
1653
+ """
1654
+ result: DoclingDocument = copy.deepcopy(self)
1655
+
1656
+ img_count = 0
1657
+ image_dir.mkdir(parents=True, exist_ok=True)
1658
+
1659
+ if image_dir.is_dir():
1660
+ for item, level in result.iterate_items(with_groups=False):
1661
+ if isinstance(item, PictureItem):
1662
+
1663
+ if (
1664
+ item.image is not None
1665
+ and isinstance(item.image.uri, AnyUrl)
1666
+ and item.image.uri.scheme == "data"
1667
+ and item.image.pil_image is not None
1668
+ ):
1669
+ img = item.image.pil_image
1670
+
1671
+ hexhash = item._image_to_hexhash()
1672
+
1673
+ # loc_path = image_dir / f"image_{img_count:06}.png"
1674
+ if hexhash is not None:
1675
+ loc_path = image_dir / f"image_{img_count:06}_{hexhash}.png"
1676
+
1677
+ img.save(loc_path)
1678
+ if reference_path is not None:
1679
+ obj_path = relative_path(
1680
+ reference_path.resolve(), loc_path.resolve()
1681
+ )
1682
+ else:
1683
+ obj_path = loc_path
1684
+
1685
+ item.image.uri = Path(obj_path)
1686
+
1687
+ # if item.image._pil is not None:
1688
+ # item.image._pil.close()
1689
+
1690
+ img_count += 1
1691
+
1692
+ return result
1693
+
1300
1694
  def print_element_tree(self):
1301
- """print_element_tree."""
1695
+ """Print_element_tree."""
1302
1696
  for ix, (item, level) in enumerate(self.iterate_items(with_groups=True)):
1303
1697
  if isinstance(item, GroupItem):
1304
1698
  print(" " * level, f"{ix}: {item.label.value} with name={item.name}")
1305
1699
  elif isinstance(item, DocItem):
1306
1700
  print(" " * level, f"{ix}: {item.label.value}")
1307
1701
 
1308
- def export_to_dict(self) -> Dict:
1309
- """export_to_dict."""
1310
- return self.model_dump(mode="json", by_alias=True, exclude_none=True)
1702
+ def export_to_element_tree(self) -> str:
1703
+ """Export_to_element_tree."""
1704
+ texts = []
1705
+ for ix, (item, level) in enumerate(self.iterate_items(with_groups=True)):
1706
+ if isinstance(item, GroupItem):
1707
+ texts.append(
1708
+ " " * level + f"{ix}: {item.label.value} with name={item.name}"
1709
+ )
1710
+ elif isinstance(item, DocItem):
1711
+ texts.append(" " * level + f"{ix}: {item.label.value}")
1712
+
1713
+ return "\n".join(texts)
1714
+
1715
+ def save_as_json(
1716
+ self,
1717
+ filename: Path,
1718
+ artifacts_dir: Optional[Path] = None,
1719
+ image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
1720
+ indent: int = 2,
1721
+ ):
1722
+ """Save as json."""
1723
+ artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
1724
+
1725
+ if image_mode == ImageRefMode.REFERENCED:
1726
+ os.makedirs(artifacts_dir, exist_ok=True)
1727
+
1728
+ new_doc = self._make_copy_with_refmode(
1729
+ artifacts_dir, image_mode, reference_path=reference_path
1730
+ )
1731
+
1732
+ out = new_doc.export_to_dict()
1733
+ with open(filename, "w") as fw:
1734
+ json.dump(out, fw, indent=indent)
1735
+
1736
+ def save_as_yaml(
1737
+ self,
1738
+ filename: Path,
1739
+ artifacts_dir: Optional[Path] = None,
1740
+ image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
1741
+ default_flow_style: bool = False,
1742
+ ):
1743
+ """Save as yaml."""
1744
+ artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
1745
+
1746
+ if image_mode == ImageRefMode.REFERENCED:
1747
+ os.makedirs(artifacts_dir, exist_ok=True)
1748
+
1749
+ new_doc = self._make_copy_with_refmode(
1750
+ artifacts_dir, image_mode, reference_path=reference_path
1751
+ )
1752
+
1753
+ out = new_doc.export_to_dict()
1754
+ with open(filename, "w") as fw:
1755
+ yaml.dump(out, fw, default_flow_style=default_flow_style)
1756
+
1757
+ def export_to_dict(
1758
+ self,
1759
+ mode: str = "json",
1760
+ by_alias: bool = True,
1761
+ exclude_none: bool = True,
1762
+ ) -> Dict:
1763
+ """Export to dict."""
1764
+ out = self.model_dump(mode=mode, by_alias=by_alias, exclude_none=exclude_none)
1765
+
1766
+ return out
1767
+
1768
+ def save_as_markdown(
1769
+ self,
1770
+ filename: Path,
1771
+ artifacts_dir: Optional[Path] = None,
1772
+ delim: str = "\n",
1773
+ from_element: int = 0,
1774
+ to_element: int = sys.maxsize,
1775
+ labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
1776
+ strict_text: bool = False,
1777
+ image_placeholder: str = "<!-- image -->",
1778
+ image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
1779
+ indent: int = 4,
1780
+ text_width: int = -1,
1781
+ page_no: Optional[int] = None,
1782
+ ):
1783
+ """Save to markdown."""
1784
+ artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
1785
+
1786
+ if image_mode == ImageRefMode.REFERENCED:
1787
+ os.makedirs(artifacts_dir, exist_ok=True)
1788
+
1789
+ new_doc = self._make_copy_with_refmode(
1790
+ artifacts_dir, image_mode, reference_path=reference_path
1791
+ )
1792
+
1793
+ md_out = new_doc.export_to_markdown(
1794
+ delim=delim,
1795
+ from_element=from_element,
1796
+ to_element=to_element,
1797
+ labels=labels,
1798
+ strict_text=strict_text,
1799
+ image_placeholder=image_placeholder,
1800
+ image_mode=image_mode,
1801
+ indent=indent,
1802
+ text_width=text_width,
1803
+ page_no=page_no,
1804
+ )
1805
+
1806
+ with open(filename, "w") as fw:
1807
+ fw.write(md_out)
1311
1808
 
1312
1809
  def export_to_markdown( # noqa: C901
1313
1810
  self,
@@ -1461,22 +1958,13 @@ class DoclingDocument(BaseModel):
1461
1958
  in_list = False
1462
1959
  mdtexts.append(item.caption_text(self))
1463
1960
 
1464
- if image_mode == ImageRefMode.PLACEHOLDER:
1465
- mdtexts.append("\n" + image_placeholder + "\n")
1466
- elif image_mode == ImageRefMode.EMBEDDED and isinstance(
1467
- item.image, ImageRef
1468
- ):
1469
- text = f"![Local Image]({item.image.uri})\n"
1470
- mdtexts.append(text)
1471
- elif image_mode == ImageRefMode.EMBEDDED and not isinstance(
1472
- item.image, ImageRef
1473
- ):
1474
- text = (
1475
- "<!-- 🖼️❌ Image not available. "
1476
- "Please use `PdfPipelineOptions(generate_picture_images=True)`"
1477
- " --> "
1478
- )
1479
- mdtexts.append(text)
1961
+ line = item.export_to_markdown(
1962
+ doc=self,
1963
+ image_placeholder=image_placeholder,
1964
+ image_mode=image_mode,
1965
+ )
1966
+
1967
+ mdtexts.append(line)
1480
1968
 
1481
1969
  elif isinstance(item, DocItem) and item.label in labels:
1482
1970
  in_list = False
@@ -1518,11 +2006,288 @@ class DoclingDocument(BaseModel):
1518
2006
  image_placeholder="",
1519
2007
  )
1520
2008
 
1521
- def export_to_document_tokens(
2009
+ def save_as_html(
1522
2010
  self,
2011
+ filename: Path,
2012
+ artifacts_dir: Optional[Path] = None,
2013
+ from_element: int = 0,
2014
+ to_element: int = sys.maxsize,
2015
+ labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
2016
+ image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
2017
+ page_no: Optional[int] = None,
2018
+ html_lang: str = "en",
2019
+ html_head: str = _HTML_DEFAULT_HEAD,
2020
+ ):
2021
+ """Save to HTML."""
2022
+ artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
2023
+
2024
+ if image_mode == ImageRefMode.REFERENCED:
2025
+ os.makedirs(artifacts_dir, exist_ok=True)
2026
+
2027
+ new_doc = self._make_copy_with_refmode(
2028
+ artifacts_dir, image_mode, reference_path=reference_path
2029
+ )
2030
+
2031
+ html_out = new_doc.export_to_html(
2032
+ from_element=from_element,
2033
+ to_element=to_element,
2034
+ labels=labels,
2035
+ image_mode=image_mode,
2036
+ page_no=page_no,
2037
+ html_lang=html_lang,
2038
+ html_head=html_head,
2039
+ )
2040
+
2041
+ with open(filename, "w") as fw:
2042
+ fw.write(html_out)
2043
+
2044
+ def _get_output_paths(
2045
+ self, filename: Path, artifacts_dir: Optional[Path] = None
2046
+ ) -> Tuple[Path, Optional[Path]]:
2047
+ if artifacts_dir is None:
2048
+ # Remove the extension and add '_pictures'
2049
+ artifacts_dir = filename.with_suffix("")
2050
+ artifacts_dir = artifacts_dir.with_name(artifacts_dir.name + "_artifacts")
2051
+ if artifacts_dir.is_absolute():
2052
+ reference_path = None
2053
+ else:
2054
+ reference_path = filename.parent
2055
+ return artifacts_dir, reference_path
2056
+
2057
+ def _make_copy_with_refmode(
2058
+ self,
2059
+ artifacts_dir: Path,
2060
+ image_mode: ImageRefMode,
2061
+ reference_path: Optional[Path] = None,
2062
+ ):
2063
+ new_doc = None
2064
+ if image_mode == ImageRefMode.PLACEHOLDER:
2065
+ new_doc = self
2066
+ elif image_mode == ImageRefMode.REFERENCED:
2067
+ new_doc = self._with_pictures_refs(
2068
+ image_dir=artifacts_dir, reference_path=reference_path
2069
+ )
2070
+ elif image_mode == ImageRefMode.EMBEDDED:
2071
+ new_doc = self._with_embedded_pictures()
2072
+ else:
2073
+ raise ValueError("Unsupported ImageRefMode")
2074
+ return new_doc
2075
+
2076
+ def export_to_html( # noqa: C901
2077
+ self,
2078
+ from_element: int = 0,
2079
+ to_element: int = sys.maxsize,
2080
+ labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
2081
+ image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
2082
+ page_no: Optional[int] = None,
2083
+ html_lang: str = "en",
2084
+ html_head: str = _HTML_DEFAULT_HEAD,
2085
+ ) -> str:
2086
+ r"""Serialize to HTML."""
2087
+
2088
+ def close_lists(
2089
+ curr_level: int,
2090
+ prev_level: int,
2091
+ in_ordered_list: List[bool],
2092
+ html_texts: list[str],
2093
+ ):
2094
+
2095
+ if len(in_ordered_list) == 0:
2096
+ return (in_ordered_list, html_texts)
2097
+
2098
+ while curr_level < prev_level and len(in_ordered_list) > 0:
2099
+ if in_ordered_list[-1]:
2100
+ html_texts.append("</ol>")
2101
+ else:
2102
+ html_texts.append("</ul>")
2103
+
2104
+ prev_level -= 1
2105
+ in_ordered_list.pop() # = in_ordered_list[:-1]
2106
+
2107
+ return (in_ordered_list, html_texts)
2108
+
2109
+ head_lines = ["<!DOCTYPE html>", f'<html lang="{html_lang}">', html_head]
2110
+ html_texts: list[str] = []
2111
+
2112
+ prev_level = 0 # Track the previous item's level
2113
+
2114
+ in_ordered_list: List[bool] = [] # False
2115
+
2116
+ for ix, (item, curr_level) in enumerate(
2117
+ self.iterate_items(self.body, with_groups=True, page_no=page_no)
2118
+ ):
2119
+ # If we've moved to a lower level, we're exiting one or more groups
2120
+ if curr_level < prev_level and len(in_ordered_list) > 0:
2121
+ # Calculate how many levels we've exited
2122
+ # level_difference = previous_level - level
2123
+ # Decrement list_nesting_level for each list group we've exited
2124
+ # list_nesting_level = max(0, list_nesting_level - level_difference)
2125
+
2126
+ in_ordered_list, html_texts = close_lists(
2127
+ curr_level=curr_level,
2128
+ prev_level=prev_level,
2129
+ in_ordered_list=in_ordered_list,
2130
+ html_texts=html_texts,
2131
+ )
2132
+
2133
+ prev_level = curr_level # Update previous_level for next iteration
2134
+
2135
+ if ix < from_element or to_element <= ix:
2136
+ continue # skip as many items as you want
2137
+
2138
+ if (isinstance(item, DocItem)) and (item.label not in labels):
2139
+ continue # skip any label that is not whitelisted
2140
+
2141
+ if isinstance(item, GroupItem) and item.label in [
2142
+ GroupLabel.ORDERED_LIST,
2143
+ ]:
2144
+
2145
+ text = "<ol>"
2146
+ html_texts.append(text.strip())
2147
+
2148
+ # Increment list nesting level when entering a new list
2149
+ in_ordered_list.append(True)
2150
+
2151
+ elif isinstance(item, GroupItem) and item.label in [
2152
+ GroupLabel.LIST,
2153
+ ]:
2154
+
2155
+ text = "<ul>"
2156
+ html_texts.append(text.strip())
2157
+
2158
+ # Increment list nesting level when entering a new list
2159
+ in_ordered_list.append(False)
2160
+
2161
+ elif isinstance(item, GroupItem):
2162
+ continue
2163
+
2164
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
2165
+
2166
+ text = f"<h1>{item.text}</h1>"
2167
+ html_texts.append(text.strip())
2168
+
2169
+ elif isinstance(item, SectionHeaderItem):
2170
+
2171
+ section_level: int = item.level + 1
2172
+
2173
+ text = f"<h{(section_level)}>{item.text}</h{(section_level)}>"
2174
+ html_texts.append(text.strip())
2175
+
2176
+ elif isinstance(item, TextItem) and item.label in [
2177
+ DocItemLabel.SECTION_HEADER
2178
+ ]:
2179
+
2180
+ section_level = curr_level
2181
+
2182
+ if section_level <= 1:
2183
+ section_level = 2
2184
+
2185
+ if section_level >= 6:
2186
+ section_level = 6
2187
+
2188
+ text = f"<h{section_level}>{item.text}</h{section_level}>"
2189
+ html_texts.append(text.strip())
2190
+
2191
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
2192
+
2193
+ text = f"<pre>{item.text}</pre>"
2194
+ html_texts.append(text)
2195
+
2196
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
2197
+ # captions are printed in picture and table ... skipping for now
2198
+ continue
2199
+
2200
+ elif isinstance(item, ListItem):
2201
+
2202
+ text = f"<li>{item.text}</li>"
2203
+ html_texts.append(text)
2204
+
2205
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
2206
+
2207
+ text = f"<li>{item.text}</li>"
2208
+ html_texts.append(text)
2209
+
2210
+ elif isinstance(item, TextItem) and item.label in labels:
2211
+
2212
+ text = f"<p>{item.text}</p>"
2213
+ html_texts.append(text.strip())
2214
+
2215
+ elif isinstance(item, TableItem):
2216
+
2217
+ text = item.export_to_html(doc=self, add_caption=True)
2218
+ html_texts.append(text)
2219
+
2220
+ elif isinstance(item, PictureItem):
2221
+
2222
+ html_texts.append(
2223
+ item.export_to_html(
2224
+ doc=self, add_caption=True, image_mode=image_mode
2225
+ )
2226
+ )
2227
+
2228
+ elif isinstance(item, DocItem) and item.label in labels:
2229
+ continue
2230
+
2231
+ html_texts.append("</html>")
2232
+
2233
+ lines = []
2234
+ lines.extend(head_lines)
2235
+ for i, line in enumerate(html_texts):
2236
+ lines.append(line.replace("\n", "<br>"))
2237
+
2238
+ delim = "\n"
2239
+ html_text = (delim.join(lines)).strip()
2240
+
2241
+ return html_text
2242
+
2243
+ def save_as_document_tokens(
2244
+ self,
2245
+ filename: Path,
1523
2246
  delim: str = "\n\n",
1524
2247
  from_element: int = 0,
1525
- to_element: Optional[int] = None,
2248
+ to_element: int = sys.maxsize,
2249
+ labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
2250
+ xsize: int = 100,
2251
+ ysize: int = 100,
2252
+ add_location: bool = True,
2253
+ add_content: bool = True,
2254
+ add_page_index: bool = True,
2255
+ # table specific flags
2256
+ add_table_cell_location: bool = False,
2257
+ add_table_cell_label: bool = True,
2258
+ add_table_cell_text: bool = True,
2259
+ # specifics
2260
+ page_no: Optional[int] = None,
2261
+ with_groups: bool = True,
2262
+ ):
2263
+ r"""Save the document content to a DocumentToken format."""
2264
+ out = self.export_to_document_tokens(
2265
+ delim=delim,
2266
+ from_element=from_element,
2267
+ to_element=to_element,
2268
+ labels=labels,
2269
+ xsize=xsize,
2270
+ ysize=ysize,
2271
+ add_location=add_location,
2272
+ add_content=add_content,
2273
+ add_page_index=add_page_index,
2274
+ # table specific flags
2275
+ add_table_cell_location=add_table_cell_location,
2276
+ add_table_cell_label=add_table_cell_label,
2277
+ add_table_cell_text=add_table_cell_text,
2278
+ # specifics
2279
+ page_no=page_no,
2280
+ with_groups=with_groups,
2281
+ )
2282
+
2283
+ with open(filename, "w") as fw:
2284
+ fw.write(out)
2285
+
2286
+ def export_to_document_tokens(
2287
+ self,
2288
+ delim: str = "\n",
2289
+ from_element: int = 0,
2290
+ to_element: int = sys.maxsize,
1526
2291
  labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
1527
2292
  xsize: int = 100,
1528
2293
  ysize: int = 100,
@@ -1533,8 +2298,12 @@ class DoclingDocument(BaseModel):
1533
2298
  add_table_cell_location: bool = False,
1534
2299
  add_table_cell_label: bool = True,
1535
2300
  add_table_cell_text: bool = True,
2301
+ # specifics
2302
+ page_no: Optional[int] = None,
2303
+ with_groups: bool = True,
2304
+ newline: bool = True,
1536
2305
  ) -> str:
1537
- r"""Exports the document content to an DocumentToken format.
2306
+ r"""Exports the document content to a DocumentToken format.
1538
2307
 
1539
2308
  Operates on a slice of the document's body as defined through arguments
1540
2309
  from_element and to_element; defaulting to the whole main_text.
@@ -1554,44 +2323,102 @@ class DoclingDocument(BaseModel):
1554
2323
  :returns: The content of the document formatted as a DocTags string.
1555
2324
  :rtype: str
1556
2325
  """
1557
- new_line = ""
1558
- if delim:
1559
- new_line = "\n"
1560
2326
 
1561
- doctags = f"{DocumentToken.BEG_DOCUMENT.value}{new_line}"
2327
+ def close_lists(
2328
+ curr_level: int,
2329
+ prev_level: int,
2330
+ in_ordered_list: List[bool],
2331
+ result: str,
2332
+ delim: str,
2333
+ ):
2334
+
2335
+ if len(in_ordered_list) == 0:
2336
+ return (in_ordered_list, result)
2337
+
2338
+ while curr_level < prev_level and len(in_ordered_list) > 0:
2339
+ if in_ordered_list[-1]:
2340
+ result += f"</ordered_list>{delim}"
2341
+ else:
2342
+ result += f"</unordered_list>{delim}"
1562
2343
 
1563
- # pagedims = self.get_map_to_page_dimensions()
2344
+ prev_level -= 1
2345
+ in_ordered_list.pop() # = in_ordered_list[:-1]
2346
+
2347
+ return (in_ordered_list, result)
2348
+
2349
+ if newline:
2350
+ delim = "\n"
2351
+ else:
2352
+ delim = ""
1564
2353
 
1565
- skip_count = 0
1566
- for ix, (item, level) in enumerate(self.iterate_items(self.body)):
1567
- if skip_count < from_element:
1568
- skip_count += 1
2354
+ prev_level = 0 # Track the previous item's level
2355
+
2356
+ in_ordered_list: List[bool] = [] # False
2357
+
2358
+ result = f"{DocumentToken.BEG_DOCUMENT.value}{delim}"
2359
+
2360
+ for ix, (item, curr_level) in enumerate(
2361
+ self.iterate_items(self.body, with_groups=True)
2362
+ ):
2363
+
2364
+ # If we've moved to a lower level, we're exiting one or more groups
2365
+ if curr_level < prev_level and len(in_ordered_list) > 0:
2366
+ # Calculate how many levels we've exited
2367
+ # level_difference = previous_level - level
2368
+ # Decrement list_nesting_level for each list group we've exited
2369
+ # list_nesting_level = max(0, list_nesting_level - level_difference)
2370
+
2371
+ in_ordered_list, result = close_lists(
2372
+ curr_level=curr_level,
2373
+ prev_level=prev_level,
2374
+ in_ordered_list=in_ordered_list,
2375
+ result=result,
2376
+ delim=delim,
2377
+ )
2378
+
2379
+ prev_level = curr_level # Update previous_level for next iteration
2380
+
2381
+ if ix < from_element or to_element <= ix:
1569
2382
  continue # skip as many items as you want
1570
2383
 
1571
- if to_element and ix >= to_element:
1572
- break
2384
+ if (isinstance(item, DocItem)) and (item.label not in labels):
2385
+ continue # skip any label that is not whitelisted
1573
2386
 
1574
- if not isinstance(item, DocItem):
1575
- continue
2387
+ if isinstance(item, GroupItem) and item.label in [
2388
+ GroupLabel.ORDERED_LIST,
2389
+ ]:
1576
2390
 
1577
- prov = item.prov
2391
+ result += f"<ordered_list>{delim}"
2392
+ in_ordered_list.append(True)
1578
2393
 
1579
- page_i = -1
2394
+ elif isinstance(item, GroupItem) and item.label in [
2395
+ GroupLabel.LIST,
2396
+ ]:
1580
2397
 
1581
- if add_location and len(self.pages) and len(prov) > 0:
2398
+ result += f"<unordered_list>{delim}"
2399
+ in_ordered_list.append(False)
1582
2400
 
1583
- page_i = prov[0].page_no
1584
- page_dim = self.pages[page_i].size
2401
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
2402
+ # captions are printed in picture and table ... skipping for now
2403
+ continue
1585
2404
 
1586
- float(page_dim.width)
1587
- float(page_dim.height)
2405
+ elif isinstance(item, SectionHeaderItem):
2406
+
2407
+ result += item.export_to_document_tokens(
2408
+ doc=self,
2409
+ new_line=delim,
2410
+ xsize=xsize,
2411
+ ysize=ysize,
2412
+ add_location=add_location,
2413
+ add_content=add_content,
2414
+ add_page_index=add_page_index,
2415
+ )
1588
2416
 
1589
- item_type = item.label
1590
- if isinstance(item, TextItem) and (item_type in labels):
2417
+ elif isinstance(item, TextItem) and (item.label in labels):
1591
2418
 
1592
- doctags += item.export_to_document_tokens(
2419
+ result += item.export_to_document_tokens(
1593
2420
  doc=self,
1594
- new_line=new_line,
2421
+ new_line=delim,
1595
2422
  xsize=xsize,
1596
2423
  ysize=ysize,
1597
2424
  add_location=add_location,
@@ -1599,11 +2426,11 @@ class DoclingDocument(BaseModel):
1599
2426
  add_page_index=add_page_index,
1600
2427
  )
1601
2428
 
1602
- elif isinstance(item, TableItem) and (item_type in labels):
2429
+ elif isinstance(item, TableItem) and (item.label in labels):
1603
2430
 
1604
- doctags += item.export_to_document_tokens(
2431
+ result += item.export_to_document_tokens(
1605
2432
  doc=self,
1606
- new_line=new_line,
2433
+ new_line=delim,
1607
2434
  xsize=xsize,
1608
2435
  ysize=ysize,
1609
2436
  add_caption=True,
@@ -1615,11 +2442,11 @@ class DoclingDocument(BaseModel):
1615
2442
  add_page_index=add_page_index,
1616
2443
  )
1617
2444
 
1618
- elif isinstance(item, PictureItem) and (item_type in labels):
2445
+ elif isinstance(item, PictureItem) and (item.label in labels):
1619
2446
 
1620
- doctags += item.export_to_document_tokens(
2447
+ result += item.export_to_document_tokens(
1621
2448
  doc=self,
1622
- new_line=new_line,
2449
+ new_line=delim,
1623
2450
  xsize=xsize,
1624
2451
  ysize=ysize,
1625
2452
  add_caption=True,
@@ -1628,9 +2455,9 @@ class DoclingDocument(BaseModel):
1628
2455
  add_page_index=add_page_index,
1629
2456
  )
1630
2457
 
1631
- doctags += DocumentToken.END_DOCUMENT.value
2458
+ result += DocumentToken.END_DOCUMENT.value
1632
2459
 
1633
- return doctags
2460
+ return result
1634
2461
 
1635
2462
  def _export_to_indented_text(
1636
2463
  self, indent=" ", max_text_len: int = -1, explicit_tables: bool = False
@@ -65,3 +65,43 @@ def resolve_file_source(
65
65
  except ValidationError:
66
66
  raise ValueError(f"Unexpected source type encountered: {type(source)}")
67
67
  return local_path
68
+
69
+
70
def relative_path(src: Path, target: Path) -> Path:
    """Compute the relative path leading from `src` to `target`.

    Both inputs are normalised with ``Path.resolve()`` before they are
    compared. Every component of `src` that lies beyond the prefix shared
    with `target` contributes one ``..`` step (a trailing file name in
    `src` counts as a component as well), followed by the components of
    `target` beyond that shared prefix.

    Args:
        src (str | Path): The location the relative path starts from.
        target (str | Path): The location the relative path leads to.

    Returns:
        Path: The relative path from `src` to `target`.

    Raises:
        ValueError: If `src` or `target` is still not absolute after being
            resolved.
    """
    origin = Path(src).resolve()
    destination = Path(target).resolve()

    # Guard against a resolve() result that is not anchored at a root.
    if not origin.is_absolute():
        raise ValueError(f"The source path must be absolute: {origin}")
    if not destination.is_absolute():
        raise ValueError(f"The target path must be absolute: {destination}")

    origin_parts = origin.parts
    destination_parts = destination.parts

    # Count how many leading components the two paths have in common.
    shared = 0
    for origin_part, destination_part in zip(origin_parts, destination_parts):
        if origin_part != destination_part:
            break
        shared += 1

    # Climb out of what is left of the source, then descend into the target.
    climb = [".."] * (len(origin_parts) - shared)
    descend = destination_parts[shared:]

    return Path(*climb, *descend)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.4.1
3
+ Version: 2.5.1
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -30,6 +30,7 @@ Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
30
30
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
31
31
  Requires-Dist: pillow (>=10.3.0,<11.0.0)
32
32
  Requires-Dist: pydantic (>=2.6.0,<2.10)
33
+ Requires-Dist: pyyaml (>=5.1,<7.0.0)
33
34
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
34
35
  Project-URL: Repository, https://github.com/DS4SD/docling-core
35
36
  Description-Content-Type: text/markdown
@@ -20,8 +20,8 @@ docling_core/transforms/chunker/hierarchical_chunker.py,sha256=V4FiOYqL0GgBqVB7x
20
20
  docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
21
21
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
22
22
  docling_core/types/doc/__init__.py,sha256=bEL4zKVOG7Wxm6xQrgF58mu-Teds9aSavuEAKVNhrTU,639
23
- docling_core/types/doc/base.py,sha256=zvx631U_yQCcJam83hNdDanXEYnO3eN-CCw9vDr6S-I,4442
24
- docling_core/types/doc/document.py,sha256=6KeHY4yl4Ry5nT6wacb8ujJ5LnyEZohXG5MAGhoPWGY,57771
23
+ docling_core/types/doc/base.py,sha256=_ttU8QI8wXDTQRUnN5n7L6D9wYFVLSAibxlFoMbgAsk,4557
24
+ docling_core/types/doc/document.py,sha256=apWwh2ixsVc0axtqJec3xKNuYmEwFDB00fQ2vJdKgBA,86018
25
25
  docling_core/types/doc/labels.py,sha256=A8vWP82VAeXO1rlCO0oDKo_Hb8uDeQe0myOTY3P03hk,1596
26
26
  docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
27
27
  docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
@@ -44,13 +44,13 @@ docling_core/types/rec/statement.py,sha256=YwcV4CbVaAbzNwh14yJ_6Py3Ww0XnUJrEEUiK
44
44
  docling_core/types/rec/subject.py,sha256=PRCERGTMs4YhR3_Ne6jogkm41zYg8uUWb1yFpM7atm4,2572
45
45
  docling_core/utils/__init__.py,sha256=VauNNpWRHG0_ISKrsy5-gTxicrdQZSau6qMfuMl3iqk,120
46
46
  docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,874
47
- docling_core/utils/file.py,sha256=rZ3kaIpX2ZGxtaSXtqjcrivtXvsbeUolLXT-nntQ5yE,2388
47
+ docling_core/utils/file.py,sha256=ug4-z0KuthkEb_d5YDRPbY79PWfNSj9GYsi16xF2sDA,3699
48
48
  docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
49
49
  docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
50
50
  docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
51
51
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
52
- docling_core-2.4.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
53
- docling_core-2.4.1.dist-info/METADATA,sha256=pvQxFa3NHJeo7DVw0G-Ew-cIICcca7Eu4bDhcst5YfU,5431
54
- docling_core-2.4.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
55
- docling_core-2.4.1.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
56
- docling_core-2.4.1.dist-info/RECORD,,
52
+ docling_core-2.5.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
53
+ docling_core-2.5.1.dist-info/METADATA,sha256=9K3Hip_Uev5copWGL0ragXG-N5uFHQiF2SNk0se2m_o,5468
54
+ docling_core-2.5.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
55
+ docling_core-2.5.1.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
56
+ docling_core-2.5.1.dist-info/RECORD,,