docling-core 2.4.1__py3-none-any.whl → 2.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/types/doc/base.py +3 -2
- docling_core/types/doc/document.py +915 -88
- docling_core/utils/file.py +40 -0
- {docling_core-2.4.1.dist-info → docling_core-2.5.1.dist-info}/METADATA +2 -1
- {docling_core-2.4.1.dist-info → docling_core-2.5.1.dist-info}/RECORD +8 -8
- {docling_core-2.4.1.dist-info → docling_core-2.5.1.dist-info}/LICENSE +0 -0
- {docling_core-2.4.1.dist-info → docling_core-2.5.1.dist-info}/WHEEL +0 -0
- {docling_core-2.4.1.dist-info → docling_core-2.5.1.dist-info}/entry_points.txt +0 -0
docling_core/types/doc/base.py
CHANGED
|
@@ -10,8 +10,9 @@ from pydantic import BaseModel
|
|
|
10
10
|
class ImageRefMode(str, Enum):
|
|
11
11
|
"""ImageRefMode."""
|
|
12
12
|
|
|
13
|
-
PLACEHOLDER = "placeholder"
|
|
14
|
-
EMBEDDED = "embedded"
|
|
13
|
+
PLACEHOLDER = "placeholder" # just a place-holder
|
|
14
|
+
EMBEDDED = "embedded" # embed the image as a base64
|
|
15
|
+
REFERENCED = "referenced" # reference the image via uri
|
|
15
16
|
|
|
16
17
|
|
|
17
18
|
class CoordOrigin(str, Enum):
|
|
@@ -1,15 +1,23 @@
|
|
|
1
1
|
"""Models for the Docling Document data type."""
|
|
2
2
|
|
|
3
3
|
import base64
|
|
4
|
+
import copy
|
|
5
|
+
import hashlib
|
|
6
|
+
import json
|
|
4
7
|
import mimetypes
|
|
8
|
+
import os
|
|
5
9
|
import re
|
|
6
10
|
import sys
|
|
7
11
|
import textwrap
|
|
8
12
|
import typing
|
|
13
|
+
import warnings
|
|
9
14
|
from io import BytesIO
|
|
15
|
+
from pathlib import Path
|
|
10
16
|
from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
|
|
17
|
+
from urllib.parse import unquote
|
|
11
18
|
|
|
12
19
|
import pandas as pd
|
|
20
|
+
import yaml
|
|
13
21
|
from PIL import Image as PILImage
|
|
14
22
|
from pydantic import (
|
|
15
23
|
AnyUrl,
|
|
@@ -30,6 +38,7 @@ from docling_core.types.doc import BoundingBox, Size
|
|
|
30
38
|
from docling_core.types.doc.base import ImageRefMode
|
|
31
39
|
from docling_core.types.doc.labels import DocItemLabel, GroupLabel
|
|
32
40
|
from docling_core.types.legacy_doc.tokens import DocumentToken
|
|
41
|
+
from docling_core.utils.file import relative_path
|
|
33
42
|
|
|
34
43
|
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
|
|
35
44
|
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
|
|
@@ -436,21 +445,25 @@ class ImageRef(BaseModel):
|
|
|
436
445
|
mimetype: str
|
|
437
446
|
dpi: int
|
|
438
447
|
size: Size
|
|
439
|
-
uri: AnyUrl
|
|
448
|
+
uri: Union[AnyUrl, Path]
|
|
440
449
|
_pil: Optional[PILImage.Image] = None
|
|
441
450
|
|
|
442
451
|
@property
|
|
443
|
-
def pil_image(self) -> PILImage.Image:
|
|
452
|
+
def pil_image(self) -> Optional[PILImage.Image]:
|
|
444
453
|
"""Return the PIL Image."""
|
|
445
454
|
if self._pil is not None:
|
|
446
455
|
return self._pil
|
|
447
456
|
|
|
448
|
-
if
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
self.
|
|
457
|
+
if isinstance(self.uri, AnyUrl):
|
|
458
|
+
if self.uri.scheme == "data":
|
|
459
|
+
encoded_img = str(self.uri).split(",")[1]
|
|
460
|
+
decoded_img = base64.b64decode(encoded_img)
|
|
461
|
+
self._pil = PILImage.open(BytesIO(decoded_img))
|
|
462
|
+
elif self.uri.scheme == "file":
|
|
463
|
+
self._pil = PILImage.open(unquote(str(self.uri.path)))
|
|
464
|
+
# else: Handle http request or other protocols...
|
|
465
|
+
elif isinstance(self.uri, Path):
|
|
466
|
+
self._pil = PILImage.open(self.uri)
|
|
454
467
|
|
|
455
468
|
return self._pil
|
|
456
469
|
|
|
@@ -566,6 +579,8 @@ class DocItem(
|
|
|
566
579
|
return None
|
|
567
580
|
|
|
568
581
|
page_image = page.image.pil_image
|
|
582
|
+
if not page_image:
|
|
583
|
+
return None
|
|
569
584
|
crop_bbox = (
|
|
570
585
|
self.prov[0]
|
|
571
586
|
.bbox.to_top_left_origin(page_height=page.size.height)
|
|
@@ -631,6 +646,50 @@ class SectionHeaderItem(TextItem):
|
|
|
631
646
|
label: typing.Literal[DocItemLabel.SECTION_HEADER] = DocItemLabel.SECTION_HEADER
|
|
632
647
|
level: LevelNumber
|
|
633
648
|
|
|
649
|
+
def export_to_document_tokens(
|
|
650
|
+
self,
|
|
651
|
+
doc: "DoclingDocument",
|
|
652
|
+
new_line: str = "\n",
|
|
653
|
+
xsize: int = 100,
|
|
654
|
+
ysize: int = 100,
|
|
655
|
+
add_location: bool = True,
|
|
656
|
+
add_content: bool = True,
|
|
657
|
+
add_page_index: bool = True,
|
|
658
|
+
):
|
|
659
|
+
r"""Export text element to document tokens format.
|
|
660
|
+
|
|
661
|
+
:param doc: "DoclingDocument":
|
|
662
|
+
:param new_line: str: (Default value = "\n")
|
|
663
|
+
:param xsize: int: (Default value = 100)
|
|
664
|
+
:param ysize: int: (Default value = 100)
|
|
665
|
+
:param add_location: bool: (Default value = True)
|
|
666
|
+
:param add_content: bool: (Default value = True)
|
|
667
|
+
:param add_page_index: bool: (Default value = True)
|
|
668
|
+
|
|
669
|
+
"""
|
|
670
|
+
body = f"<{self.label.value}_level_{self.level}>"
|
|
671
|
+
|
|
672
|
+
# TODO: This must be done through an explicit mapping.
|
|
673
|
+
# assert DocumentToken.is_known_token(
|
|
674
|
+
# body
|
|
675
|
+
# ), f"failed DocumentToken.is_known_token({body})"
|
|
676
|
+
|
|
677
|
+
if add_location:
|
|
678
|
+
body += self.get_location_tokens(
|
|
679
|
+
doc=doc,
|
|
680
|
+
new_line="",
|
|
681
|
+
xsize=xsize,
|
|
682
|
+
ysize=ysize,
|
|
683
|
+
add_page_index=add_page_index,
|
|
684
|
+
)
|
|
685
|
+
|
|
686
|
+
if add_content and self.text is not None:
|
|
687
|
+
body += self.text.strip()
|
|
688
|
+
|
|
689
|
+
body += f"</{self.label.value}_level_{self.level}>{new_line}"
|
|
690
|
+
|
|
691
|
+
return body
|
|
692
|
+
|
|
634
693
|
|
|
635
694
|
class ListItem(TextItem):
|
|
636
695
|
"""SectionItem."""
|
|
@@ -677,6 +736,152 @@ class PictureItem(FloatingItem):
|
|
|
677
736
|
|
|
678
737
|
annotations: List[PictureDataType] = []
|
|
679
738
|
|
|
739
|
+
# Convert the image to Base64
|
|
740
|
+
def _image_to_base64(self, pil_image, format="PNG"):
|
|
741
|
+
"""Base64 representation of the image."""
|
|
742
|
+
buffered = BytesIO()
|
|
743
|
+
pil_image.save(buffered, format=format) # Save the image to the byte stream
|
|
744
|
+
img_bytes = buffered.getvalue() # Get the byte data
|
|
745
|
+
img_base64 = base64.b64encode(img_bytes).decode(
|
|
746
|
+
"utf-8"
|
|
747
|
+
) # Encode to Base64 and decode to string
|
|
748
|
+
return img_base64
|
|
749
|
+
|
|
750
|
+
def _image_to_hexhash(self) -> Optional[str]:
|
|
751
|
+
"""Hexash from the image."""
|
|
752
|
+
if self.image is not None and self.image._pil is not None:
|
|
753
|
+
# Convert the image to raw bytes
|
|
754
|
+
image_bytes = self.image._pil.tobytes()
|
|
755
|
+
|
|
756
|
+
# Create a hash object (e.g., SHA-256)
|
|
757
|
+
hasher = hashlib.sha256()
|
|
758
|
+
|
|
759
|
+
# Feed the image bytes into the hash object
|
|
760
|
+
hasher.update(image_bytes)
|
|
761
|
+
|
|
762
|
+
# Get the hexadecimal representation of the hash
|
|
763
|
+
return hasher.hexdigest()
|
|
764
|
+
|
|
765
|
+
return None
|
|
766
|
+
|
|
767
|
+
def export_to_markdown(
|
|
768
|
+
self,
|
|
769
|
+
doc: "DoclingDocument",
|
|
770
|
+
add_caption: bool = True,
|
|
771
|
+
image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
|
|
772
|
+
image_placeholder: str = "<!-- image -->",
|
|
773
|
+
) -> str:
|
|
774
|
+
"""Export picture to Markdown format."""
|
|
775
|
+
default_response = "\n" + image_placeholder + "\n"
|
|
776
|
+
error_response = (
|
|
777
|
+
"\n<!-- 🖼️❌ Image not available. "
|
|
778
|
+
"Please use `PdfPipelineOptions(generate_picture_images=True)`"
|
|
779
|
+
" --> \n"
|
|
780
|
+
)
|
|
781
|
+
|
|
782
|
+
if image_mode == ImageRefMode.PLACEHOLDER:
|
|
783
|
+
return default_response
|
|
784
|
+
|
|
785
|
+
elif image_mode == ImageRefMode.EMBEDDED:
|
|
786
|
+
|
|
787
|
+
# short-cut: we already have the image in base64
|
|
788
|
+
if (
|
|
789
|
+
isinstance(self.image, ImageRef)
|
|
790
|
+
and isinstance(self.image.uri, AnyUrl)
|
|
791
|
+
and self.image.uri.scheme == "data"
|
|
792
|
+
):
|
|
793
|
+
text = f"\n![Image]({self.image.uri})\n"
|
|
794
|
+
return text
|
|
795
|
+
|
|
796
|
+
# get the self.image._pil or crop it out of the page-image
|
|
797
|
+
img = self.get_image(doc)
|
|
798
|
+
|
|
799
|
+
if img is not None:
|
|
800
|
+
imgb64 = self._image_to_base64(img)
|
|
801
|
+
text = f"\n![Image](data:image/png;base64,{imgb64})\n"
|
|
802
|
+
|
|
803
|
+
return text
|
|
804
|
+
else:
|
|
805
|
+
return error_response
|
|
806
|
+
|
|
807
|
+
elif image_mode == ImageRefMode.REFERENCED:
|
|
808
|
+
if not isinstance(self.image, ImageRef) or (
|
|
809
|
+
isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "data"
|
|
810
|
+
):
|
|
811
|
+
return default_response
|
|
812
|
+
|
|
813
|
+
if (
|
|
814
|
+
isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "file"
|
|
815
|
+
) or isinstance(self.image.uri, Path):
|
|
816
|
+
text = f"\n![Image]({str(self.image.uri)})\n"
|
|
817
|
+
return text
|
|
818
|
+
|
|
819
|
+
else:
|
|
820
|
+
return default_response
|
|
821
|
+
|
|
822
|
+
else:
|
|
823
|
+
return default_response
|
|
824
|
+
|
|
825
|
+
def export_to_html(
|
|
826
|
+
self,
|
|
827
|
+
doc: "DoclingDocument",
|
|
828
|
+
add_caption: bool = True,
|
|
829
|
+
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
|
|
830
|
+
) -> str:
|
|
831
|
+
"""Export picture to HTML format."""
|
|
832
|
+
text = ""
|
|
833
|
+
if add_caption and len(self.captions):
|
|
834
|
+
text = self.caption_text(doc)
|
|
835
|
+
|
|
836
|
+
caption_text = ""
|
|
837
|
+
if len(text) > 0:
|
|
838
|
+
caption_text = f"<figcaption>{text}</figcaption>"
|
|
839
|
+
|
|
840
|
+
default_response = f"<figure>{caption_text}</figure>"
|
|
841
|
+
|
|
842
|
+
if image_mode == ImageRefMode.PLACEHOLDER:
|
|
843
|
+
return default_response
|
|
844
|
+
|
|
845
|
+
elif image_mode == ImageRefMode.EMBEDDED:
|
|
846
|
+
# short-cut: we already have the image in base64
|
|
847
|
+
if (
|
|
848
|
+
isinstance(self.image, ImageRef)
|
|
849
|
+
and isinstance(self.image.uri, AnyUrl)
|
|
850
|
+
and self.image.uri.scheme == "data"
|
|
851
|
+
):
|
|
852
|
+
img_text = f'<img src="{self.image.uri}">'
|
|
853
|
+
return f"<figure>{caption_text}{img_text}</figure>"
|
|
854
|
+
|
|
855
|
+
# get the self.image._pil or crop it out of the page-image
|
|
856
|
+
img = self.get_image(doc)
|
|
857
|
+
|
|
858
|
+
if img is not None:
|
|
859
|
+
imgb64 = self._image_to_base64(img)
|
|
860
|
+
img_text = f'<img src="data:image/png;base64,{imgb64}">'
|
|
861
|
+
|
|
862
|
+
return f"<figure>{caption_text}{img_text}</figure>"
|
|
863
|
+
else:
|
|
864
|
+
return default_response
|
|
865
|
+
|
|
866
|
+
elif image_mode == ImageRefMode.REFERENCED:
|
|
867
|
+
|
|
868
|
+
if not isinstance(self.image, ImageRef) or (
|
|
869
|
+
isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "data"
|
|
870
|
+
):
|
|
871
|
+
return default_response
|
|
872
|
+
|
|
873
|
+
if (
|
|
874
|
+
isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "file"
|
|
875
|
+
) or isinstance(self.image.uri, Path):
|
|
876
|
+
img_text = f'<img src="{str(self.image.uri)}">'
|
|
877
|
+
return f"<figure>{caption_text}{img_text}</figure>"
|
|
878
|
+
|
|
879
|
+
else:
|
|
880
|
+
return default_response
|
|
881
|
+
|
|
882
|
+
else:
|
|
883
|
+
return default_response
|
|
884
|
+
|
|
680
885
|
def export_to_document_tokens(
|
|
681
886
|
self,
|
|
682
887
|
doc: "DoclingDocument",
|
|
@@ -804,14 +1009,30 @@ class TableItem(FloatingItem):
|
|
|
804
1009
|
)
|
|
805
1010
|
return md_table
|
|
806
1011
|
|
|
807
|
-
def export_to_html(
|
|
1012
|
+
def export_to_html(
|
|
1013
|
+
self, doc: Optional["DoclingDocument"] = None, add_caption: bool = True
|
|
1014
|
+
) -> str:
|
|
808
1015
|
"""Export the table as html."""
|
|
1016
|
+
if doc is None:
|
|
1017
|
+
warnings.warn(
|
|
1018
|
+
"The `doc` argument will be mandatory in a future version. "
|
|
1019
|
+
"It must be provided to include a caption.",
|
|
1020
|
+
DeprecationWarning,
|
|
1021
|
+
)
|
|
1022
|
+
|
|
809
1023
|
body = ""
|
|
810
1024
|
nrows = self.data.num_rows
|
|
811
1025
|
ncols = self.data.num_cols
|
|
812
1026
|
|
|
813
|
-
|
|
1027
|
+
text = ""
|
|
1028
|
+
if doc is not None and add_caption and len(self.captions):
|
|
1029
|
+
text = self.caption_text(doc)
|
|
1030
|
+
|
|
1031
|
+
if len(self.data.table_cells) == 0:
|
|
814
1032
|
return ""
|
|
1033
|
+
|
|
1034
|
+
body = ""
|
|
1035
|
+
|
|
815
1036
|
for i in range(nrows):
|
|
816
1037
|
body += "<tr>"
|
|
817
1038
|
for j in range(ncols):
|
|
@@ -844,7 +1065,15 @@ class TableItem(FloatingItem):
|
|
|
844
1065
|
|
|
845
1066
|
body += f"<{opening_tag}>{content}</{celltag}>"
|
|
846
1067
|
body += "</tr>"
|
|
847
|
-
|
|
1068
|
+
|
|
1069
|
+
if len(text) > 0 and len(body) > 0:
|
|
1070
|
+
body = f"<table><caption>{text}</caption><tbody>{body}</tbody></table>"
|
|
1071
|
+
elif len(text) == 0 and len(body) > 0:
|
|
1072
|
+
body = f"<table><tbody>{body}</tbody></table>"
|
|
1073
|
+
elif len(text) > 0 and len(body) == 0:
|
|
1074
|
+
body = f"<table><caption>{text}</caption></table>"
|
|
1075
|
+
else:
|
|
1076
|
+
body = "<table></table>"
|
|
848
1077
|
|
|
849
1078
|
return body
|
|
850
1079
|
|
|
@@ -981,6 +1210,23 @@ class PageItem(BaseModel):
|
|
|
981
1210
|
class DoclingDocument(BaseModel):
|
|
982
1211
|
"""DoclingDocument."""
|
|
983
1212
|
|
|
1213
|
+
_HTML_DEFAULT_HEAD: str = r"""<head>
|
|
1214
|
+
<meta charset="UTF-8">
|
|
1215
|
+
<style>
|
|
1216
|
+
table {
|
|
1217
|
+
border-collapse: separate;
|
|
1218
|
+
/* Maintain separate borders */
|
|
1219
|
+
border-spacing: 5px; /*
|
|
1220
|
+
Space between cells */
|
|
1221
|
+
width: 50%;
|
|
1222
|
+
}
|
|
1223
|
+
th, td {
|
|
1224
|
+
border: 1px solid black;
|
|
1225
|
+
/* Add lines etween cells */
|
|
1226
|
+
padding: 8px; }
|
|
1227
|
+
</style>
|
|
1228
|
+
</head>"""
|
|
1229
|
+
|
|
984
1230
|
schema_name: typing.Literal["DoclingDocument"] = "DoclingDocument"
|
|
985
1231
|
version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
|
|
986
1232
|
CURRENT_VERSION
|
|
@@ -1045,7 +1291,7 @@ class DoclingDocument(BaseModel):
|
|
|
1045
1291
|
prov: Optional[ProvenanceItem] = None,
|
|
1046
1292
|
parent: Optional[GroupItem] = None,
|
|
1047
1293
|
):
|
|
1048
|
-
"""
|
|
1294
|
+
"""add_list_item.
|
|
1049
1295
|
|
|
1050
1296
|
:param label: str:
|
|
1051
1297
|
:param text: str:
|
|
@@ -1088,7 +1334,7 @@ class DoclingDocument(BaseModel):
|
|
|
1088
1334
|
prov: Optional[ProvenanceItem] = None,
|
|
1089
1335
|
parent: Optional[GroupItem] = None,
|
|
1090
1336
|
):
|
|
1091
|
-
"""
|
|
1337
|
+
"""add_text.
|
|
1092
1338
|
|
|
1093
1339
|
:param label: str:
|
|
1094
1340
|
:param text: str:
|
|
@@ -1097,28 +1343,41 @@ class DoclingDocument(BaseModel):
|
|
|
1097
1343
|
:param parent: Optional[GroupItem]: (Default value = None)
|
|
1098
1344
|
|
|
1099
1345
|
"""
|
|
1100
|
-
|
|
1101
|
-
|
|
1346
|
+
# Catch a few cases that are in principle allowed
|
|
1347
|
+
# but that will create confusion down the road
|
|
1348
|
+
if label in [DocItemLabel.TITLE]:
|
|
1349
|
+
return self.add_title(text=text, orig=orig, prov=prov, parent=parent)
|
|
1102
1350
|
|
|
1103
|
-
|
|
1104
|
-
orig =
|
|
1351
|
+
elif label in [DocItemLabel.LIST_ITEM]:
|
|
1352
|
+
return self.add_list_item(text=text, orig=orig, prov=prov, parent=parent)
|
|
1105
1353
|
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
text_item = TextItem(
|
|
1109
|
-
label=label,
|
|
1110
|
-
text=text,
|
|
1111
|
-
orig=orig,
|
|
1112
|
-
self_ref=cref,
|
|
1113
|
-
parent=parent.get_ref(),
|
|
1114
|
-
)
|
|
1115
|
-
if prov:
|
|
1116
|
-
text_item.prov.append(prov)
|
|
1354
|
+
elif label in [DocItemLabel.SECTION_HEADER]:
|
|
1355
|
+
return self.add_heading(text=text, orig=orig, prov=prov, parent=parent)
|
|
1117
1356
|
|
|
1118
|
-
|
|
1119
|
-
parent.children.append(RefItem(cref=cref))
|
|
1357
|
+
else:
|
|
1120
1358
|
|
|
1121
|
-
|
|
1359
|
+
if not parent:
|
|
1360
|
+
parent = self.body
|
|
1361
|
+
|
|
1362
|
+
if not orig:
|
|
1363
|
+
orig = text
|
|
1364
|
+
|
|
1365
|
+
text_index = len(self.texts)
|
|
1366
|
+
cref = f"#/texts/{text_index}"
|
|
1367
|
+
text_item = TextItem(
|
|
1368
|
+
label=label,
|
|
1369
|
+
text=text,
|
|
1370
|
+
orig=orig,
|
|
1371
|
+
self_ref=cref,
|
|
1372
|
+
parent=parent.get_ref(),
|
|
1373
|
+
)
|
|
1374
|
+
if prov:
|
|
1375
|
+
text_item.prov.append(prov)
|
|
1376
|
+
|
|
1377
|
+
self.texts.append(text_item)
|
|
1378
|
+
parent.children.append(RefItem(cref=cref))
|
|
1379
|
+
|
|
1380
|
+
return text_item
|
|
1122
1381
|
|
|
1123
1382
|
def add_table(
|
|
1124
1383
|
self,
|
|
@@ -1170,7 +1429,6 @@ class DoclingDocument(BaseModel):
|
|
|
1170
1429
|
:param RefItem]]: (Default value = None)
|
|
1171
1430
|
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1172
1431
|
:param parent: Optional[GroupItem]: (Default value = None)
|
|
1173
|
-
|
|
1174
1432
|
"""
|
|
1175
1433
|
if not parent:
|
|
1176
1434
|
parent = self.body
|
|
@@ -1195,6 +1453,43 @@ class DoclingDocument(BaseModel):
|
|
|
1195
1453
|
|
|
1196
1454
|
return fig_item
|
|
1197
1455
|
|
|
1456
|
+
def add_title(
|
|
1457
|
+
self,
|
|
1458
|
+
text: str,
|
|
1459
|
+
orig: Optional[str] = None,
|
|
1460
|
+
prov: Optional[ProvenanceItem] = None,
|
|
1461
|
+
parent: Optional[GroupItem] = None,
|
|
1462
|
+
):
|
|
1463
|
+
"""add_title.
|
|
1464
|
+
|
|
1465
|
+
:param text: str:
|
|
1466
|
+
:param orig: Optional[str]: (Default value = None)
|
|
1467
|
+
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1468
|
+
:param parent: Optional[GroupItem]: (Default value = None)
|
|
1469
|
+
"""
|
|
1470
|
+
if not parent:
|
|
1471
|
+
parent = self.body
|
|
1472
|
+
|
|
1473
|
+
if not orig:
|
|
1474
|
+
orig = text
|
|
1475
|
+
|
|
1476
|
+
text_index = len(self.texts)
|
|
1477
|
+
cref = f"#/texts/{text_index}"
|
|
1478
|
+
text_item = TextItem(
|
|
1479
|
+
label=DocItemLabel.TITLE,
|
|
1480
|
+
text=text,
|
|
1481
|
+
orig=orig,
|
|
1482
|
+
self_ref=cref,
|
|
1483
|
+
parent=parent.get_ref(),
|
|
1484
|
+
)
|
|
1485
|
+
if prov:
|
|
1486
|
+
text_item.prov.append(prov)
|
|
1487
|
+
|
|
1488
|
+
self.texts.append(text_item)
|
|
1489
|
+
parent.children.append(RefItem(cref=cref))
|
|
1490
|
+
|
|
1491
|
+
return text_item
|
|
1492
|
+
|
|
1198
1493
|
def add_heading(
|
|
1199
1494
|
self,
|
|
1200
1495
|
text: str,
|
|
@@ -1211,7 +1506,6 @@ class DoclingDocument(BaseModel):
|
|
|
1211
1506
|
:param level: LevelNumber: (Default value = 1)
|
|
1212
1507
|
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1213
1508
|
:param parent: Optional[GroupItem]: (Default value = None)
|
|
1214
|
-
|
|
1215
1509
|
"""
|
|
1216
1510
|
if not parent:
|
|
1217
1511
|
parent = self.body
|
|
@@ -1297,17 +1591,220 @@ class DoclingDocument(BaseModel):
|
|
|
1297
1591
|
page_no=page_no,
|
|
1298
1592
|
)
|
|
1299
1593
|
|
|
1594
|
+
def _clear_picture_pil_cache(self):
|
|
1595
|
+
"""Clear cache storage of all images."""
|
|
1596
|
+
for item, level in self.iterate_items(with_groups=False):
|
|
1597
|
+
if isinstance(item, PictureItem):
|
|
1598
|
+
if item.image is not None and item.image._pil is not None:
|
|
1599
|
+
item.image._pil.close()
|
|
1600
|
+
|
|
1601
|
+
def _list_images_on_disk(self) -> List[Path]:
|
|
1602
|
+
"""List all images on disk."""
|
|
1603
|
+
result: List[Path] = []
|
|
1604
|
+
|
|
1605
|
+
for item, level in self.iterate_items(with_groups=False):
|
|
1606
|
+
if isinstance(item, PictureItem):
|
|
1607
|
+
if item.image is not None:
|
|
1608
|
+
if (
|
|
1609
|
+
isinstance(item.image.uri, AnyUrl)
|
|
1610
|
+
and item.image.uri.scheme == "file"
|
|
1611
|
+
and item.image.uri.path is not None
|
|
1612
|
+
):
|
|
1613
|
+
local_path = Path(unquote(item.image.uri.path))
|
|
1614
|
+
result.append(local_path)
|
|
1615
|
+
elif isinstance(item.image.uri, Path):
|
|
1616
|
+
result.append(item.image.uri)
|
|
1617
|
+
|
|
1618
|
+
return result
|
|
1619
|
+
|
|
1620
|
+
def _with_embedded_pictures(self) -> "DoclingDocument":
|
|
1621
|
+
"""Document with embedded images.
|
|
1622
|
+
|
|
1623
|
+
Creates a copy of this document where all pictures referenced
|
|
1624
|
+
through a file URI are turned into base64 embedded form.
|
|
1625
|
+
"""
|
|
1626
|
+
result: DoclingDocument = copy.deepcopy(self)
|
|
1627
|
+
|
|
1628
|
+
for ix, (item, level) in enumerate(result.iterate_items(with_groups=True)):
|
|
1629
|
+
if isinstance(item, PictureItem):
|
|
1630
|
+
|
|
1631
|
+
if item.image is not None:
|
|
1632
|
+
if (
|
|
1633
|
+
isinstance(item.image.uri, AnyUrl)
|
|
1634
|
+
and item.image.uri.scheme == "file"
|
|
1635
|
+
):
|
|
1636
|
+
assert isinstance(item.image.uri.path, str)
|
|
1637
|
+
tmp_image = PILImage.open(str(unquote(item.image.uri.path)))
|
|
1638
|
+
item.image = ImageRef.from_pil(tmp_image, dpi=item.image.dpi)
|
|
1639
|
+
|
|
1640
|
+
elif isinstance(item.image.uri, Path):
|
|
1641
|
+
tmp_image = PILImage.open(str(item.image.uri))
|
|
1642
|
+
item.image = ImageRef.from_pil(tmp_image, dpi=item.image.dpi)
|
|
1643
|
+
|
|
1644
|
+
return result
|
|
1645
|
+
|
|
1646
|
+
def _with_pictures_refs(
|
|
1647
|
+
self, image_dir: Path, reference_path: Optional[Path] = None
|
|
1648
|
+
) -> "DoclingDocument":
|
|
1649
|
+
"""Document with images as refs.
|
|
1650
|
+
|
|
1651
|
+
Creates a copy of this document where all picture data is
|
|
1652
|
+
saved to image_dir and referenced through file URIs.
|
|
1653
|
+
"""
|
|
1654
|
+
result: DoclingDocument = copy.deepcopy(self)
|
|
1655
|
+
|
|
1656
|
+
img_count = 0
|
|
1657
|
+
image_dir.mkdir(parents=True, exist_ok=True)
|
|
1658
|
+
|
|
1659
|
+
if image_dir.is_dir():
|
|
1660
|
+
for item, level in result.iterate_items(with_groups=False):
|
|
1661
|
+
if isinstance(item, PictureItem):
|
|
1662
|
+
|
|
1663
|
+
if (
|
|
1664
|
+
item.image is not None
|
|
1665
|
+
and isinstance(item.image.uri, AnyUrl)
|
|
1666
|
+
and item.image.uri.scheme == "data"
|
|
1667
|
+
and item.image.pil_image is not None
|
|
1668
|
+
):
|
|
1669
|
+
img = item.image.pil_image
|
|
1670
|
+
|
|
1671
|
+
hexhash = item._image_to_hexhash()
|
|
1672
|
+
|
|
1673
|
+
# loc_path = image_dir / f"image_{img_count:06}.png"
|
|
1674
|
+
if hexhash is not None:
|
|
1675
|
+
loc_path = image_dir / f"image_{img_count:06}_{hexhash}.png"
|
|
1676
|
+
|
|
1677
|
+
img.save(loc_path)
|
|
1678
|
+
if reference_path is not None:
|
|
1679
|
+
obj_path = relative_path(
|
|
1680
|
+
reference_path.resolve(), loc_path.resolve()
|
|
1681
|
+
)
|
|
1682
|
+
else:
|
|
1683
|
+
obj_path = loc_path
|
|
1684
|
+
|
|
1685
|
+
item.image.uri = Path(obj_path)
|
|
1686
|
+
|
|
1687
|
+
# if item.image._pil is not None:
|
|
1688
|
+
# item.image._pil.close()
|
|
1689
|
+
|
|
1690
|
+
img_count += 1
|
|
1691
|
+
|
|
1692
|
+
return result
|
|
1693
|
+
|
|
1300
1694
|
def print_element_tree(self):
|
|
1301
|
-
"""
|
|
1695
|
+
"""Print_element_tree."""
|
|
1302
1696
|
for ix, (item, level) in enumerate(self.iterate_items(with_groups=True)):
|
|
1303
1697
|
if isinstance(item, GroupItem):
|
|
1304
1698
|
print(" " * level, f"{ix}: {item.label.value} with name={item.name}")
|
|
1305
1699
|
elif isinstance(item, DocItem):
|
|
1306
1700
|
print(" " * level, f"{ix}: {item.label.value}")
|
|
1307
1701
|
|
|
1308
|
-
def
|
|
1309
|
-
"""
|
|
1310
|
-
|
|
1702
|
+
def export_to_element_tree(self) -> str:
|
|
1703
|
+
"""Export_to_element_tree."""
|
|
1704
|
+
texts = []
|
|
1705
|
+
for ix, (item, level) in enumerate(self.iterate_items(with_groups=True)):
|
|
1706
|
+
if isinstance(item, GroupItem):
|
|
1707
|
+
texts.append(
|
|
1708
|
+
" " * level + f"{ix}: {item.label.value} with name={item.name}"
|
|
1709
|
+
)
|
|
1710
|
+
elif isinstance(item, DocItem):
|
|
1711
|
+
texts.append(" " * level + f"{ix}: {item.label.value}")
|
|
1712
|
+
|
|
1713
|
+
return "\n".join(texts)
|
|
1714
|
+
|
|
1715
|
+
def save_as_json(
|
|
1716
|
+
self,
|
|
1717
|
+
filename: Path,
|
|
1718
|
+
artifacts_dir: Optional[Path] = None,
|
|
1719
|
+
image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
|
|
1720
|
+
indent: int = 2,
|
|
1721
|
+
):
|
|
1722
|
+
"""Save as json."""
|
|
1723
|
+
artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
|
|
1724
|
+
|
|
1725
|
+
if image_mode == ImageRefMode.REFERENCED:
|
|
1726
|
+
os.makedirs(artifacts_dir, exist_ok=True)
|
|
1727
|
+
|
|
1728
|
+
new_doc = self._make_copy_with_refmode(
|
|
1729
|
+
artifacts_dir, image_mode, reference_path=reference_path
|
|
1730
|
+
)
|
|
1731
|
+
|
|
1732
|
+
out = new_doc.export_to_dict()
|
|
1733
|
+
with open(filename, "w") as fw:
|
|
1734
|
+
json.dump(out, fw, indent=indent)
|
|
1735
|
+
|
|
1736
|
+
def save_as_yaml(
|
|
1737
|
+
self,
|
|
1738
|
+
filename: Path,
|
|
1739
|
+
artifacts_dir: Optional[Path] = None,
|
|
1740
|
+
image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
|
|
1741
|
+
default_flow_style: bool = False,
|
|
1742
|
+
):
|
|
1743
|
+
"""Save as yaml."""
|
|
1744
|
+
artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
|
|
1745
|
+
|
|
1746
|
+
if image_mode == ImageRefMode.REFERENCED:
|
|
1747
|
+
os.makedirs(artifacts_dir, exist_ok=True)
|
|
1748
|
+
|
|
1749
|
+
new_doc = self._make_copy_with_refmode(
|
|
1750
|
+
artifacts_dir, image_mode, reference_path=reference_path
|
|
1751
|
+
)
|
|
1752
|
+
|
|
1753
|
+
out = new_doc.export_to_dict()
|
|
1754
|
+
with open(filename, "w") as fw:
|
|
1755
|
+
yaml.dump(out, fw, default_flow_style=default_flow_style)
|
|
1756
|
+
|
|
1757
|
+
def export_to_dict(
|
|
1758
|
+
self,
|
|
1759
|
+
mode: str = "json",
|
|
1760
|
+
by_alias: bool = True,
|
|
1761
|
+
exclude_none: bool = True,
|
|
1762
|
+
) -> Dict:
|
|
1763
|
+
"""Export to dict."""
|
|
1764
|
+
out = self.model_dump(mode=mode, by_alias=by_alias, exclude_none=exclude_none)
|
|
1765
|
+
|
|
1766
|
+
return out
|
|
1767
|
+
|
|
1768
|
+
def save_as_markdown(
|
|
1769
|
+
self,
|
|
1770
|
+
filename: Path,
|
|
1771
|
+
artifacts_dir: Optional[Path] = None,
|
|
1772
|
+
delim: str = "\n",
|
|
1773
|
+
from_element: int = 0,
|
|
1774
|
+
to_element: int = sys.maxsize,
|
|
1775
|
+
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
|
|
1776
|
+
strict_text: bool = False,
|
|
1777
|
+
image_placeholder: str = "<!-- image -->",
|
|
1778
|
+
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
|
|
1779
|
+
indent: int = 4,
|
|
1780
|
+
text_width: int = -1,
|
|
1781
|
+
page_no: Optional[int] = None,
|
|
1782
|
+
):
|
|
1783
|
+
"""Save to markdown."""
|
|
1784
|
+
artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
|
|
1785
|
+
|
|
1786
|
+
if image_mode == ImageRefMode.REFERENCED:
|
|
1787
|
+
os.makedirs(artifacts_dir, exist_ok=True)
|
|
1788
|
+
|
|
1789
|
+
new_doc = self._make_copy_with_refmode(
|
|
1790
|
+
artifacts_dir, image_mode, reference_path=reference_path
|
|
1791
|
+
)
|
|
1792
|
+
|
|
1793
|
+
md_out = new_doc.export_to_markdown(
|
|
1794
|
+
delim=delim,
|
|
1795
|
+
from_element=from_element,
|
|
1796
|
+
to_element=to_element,
|
|
1797
|
+
labels=labels,
|
|
1798
|
+
strict_text=strict_text,
|
|
1799
|
+
image_placeholder=image_placeholder,
|
|
1800
|
+
image_mode=image_mode,
|
|
1801
|
+
indent=indent,
|
|
1802
|
+
text_width=text_width,
|
|
1803
|
+
page_no=page_no,
|
|
1804
|
+
)
|
|
1805
|
+
|
|
1806
|
+
with open(filename, "w") as fw:
|
|
1807
|
+
fw.write(md_out)
|
|
1311
1808
|
|
|
1312
1809
|
def export_to_markdown( # noqa: C901
|
|
1313
1810
|
self,
|
|
@@ -1461,22 +1958,13 @@ class DoclingDocument(BaseModel):
|
|
|
1461
1958
|
in_list = False
|
|
1462
1959
|
mdtexts.append(item.caption_text(self))
|
|
1463
1960
|
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
)
|
|
1469
|
-
|
|
1470
|
-
|
|
1471
|
-
elif image_mode == ImageRefMode.EMBEDDED and not isinstance(
|
|
1472
|
-
item.image, ImageRef
|
|
1473
|
-
):
|
|
1474
|
-
text = (
|
|
1475
|
-
"<!-- 🖼️❌ Image not available. "
|
|
1476
|
-
"Please use `PdfPipelineOptions(generate_picture_images=True)`"
|
|
1477
|
-
" --> "
|
|
1478
|
-
)
|
|
1479
|
-
mdtexts.append(text)
|
|
1961
|
+
line = item.export_to_markdown(
|
|
1962
|
+
doc=self,
|
|
1963
|
+
image_placeholder=image_placeholder,
|
|
1964
|
+
image_mode=image_mode,
|
|
1965
|
+
)
|
|
1966
|
+
|
|
1967
|
+
mdtexts.append(line)
|
|
1480
1968
|
|
|
1481
1969
|
elif isinstance(item, DocItem) and item.label in labels:
|
|
1482
1970
|
in_list = False
|
|
@@ -1518,11 +2006,288 @@ class DoclingDocument(BaseModel):
|
|
|
1518
2006
|
image_placeholder="",
|
|
1519
2007
|
)
|
|
1520
2008
|
|
|
1521
|
-
def
|
|
2009
|
+
def save_as_html(
|
|
1522
2010
|
self,
|
|
2011
|
+
filename: Path,
|
|
2012
|
+
artifacts_dir: Optional[Path] = None,
|
|
2013
|
+
from_element: int = 0,
|
|
2014
|
+
to_element: int = sys.maxsize,
|
|
2015
|
+
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
|
|
2016
|
+
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
|
|
2017
|
+
page_no: Optional[int] = None,
|
|
2018
|
+
html_lang: str = "en",
|
|
2019
|
+
html_head: str = _HTML_DEFAULT_HEAD,
|
|
2020
|
+
):
|
|
2021
|
+
"""Save to HTML."""
|
|
2022
|
+
artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
|
|
2023
|
+
|
|
2024
|
+
if image_mode == ImageRefMode.REFERENCED:
|
|
2025
|
+
os.makedirs(artifacts_dir, exist_ok=True)
|
|
2026
|
+
|
|
2027
|
+
new_doc = self._make_copy_with_refmode(
|
|
2028
|
+
artifacts_dir, image_mode, reference_path=reference_path
|
|
2029
|
+
)
|
|
2030
|
+
|
|
2031
|
+
html_out = new_doc.export_to_html(
|
|
2032
|
+
from_element=from_element,
|
|
2033
|
+
to_element=to_element,
|
|
2034
|
+
labels=labels,
|
|
2035
|
+
image_mode=image_mode,
|
|
2036
|
+
page_no=page_no,
|
|
2037
|
+
html_lang=html_lang,
|
|
2038
|
+
html_head=html_head,
|
|
2039
|
+
)
|
|
2040
|
+
|
|
2041
|
+
with open(filename, "w") as fw:
|
|
2042
|
+
fw.write(html_out)
|
|
2043
|
+
|
|
2044
|
+
def _get_output_paths(
|
|
2045
|
+
self, filename: Path, artifacts_dir: Optional[Path] = None
|
|
2046
|
+
) -> Tuple[Path, Optional[Path]]:
|
|
2047
|
+
if artifacts_dir is None:
|
|
2048
|
+
# Remove the extension and add '_pictures'
|
|
2049
|
+
artifacts_dir = filename.with_suffix("")
|
|
2050
|
+
artifacts_dir = artifacts_dir.with_name(artifacts_dir.name + "_artifacts")
|
|
2051
|
+
if artifacts_dir.is_absolute():
|
|
2052
|
+
reference_path = None
|
|
2053
|
+
else:
|
|
2054
|
+
reference_path = filename.parent
|
|
2055
|
+
return artifacts_dir, reference_path
|
|
2056
|
+
|
|
2057
|
+
def _make_copy_with_refmode(
|
|
2058
|
+
self,
|
|
2059
|
+
artifacts_dir: Path,
|
|
2060
|
+
image_mode: ImageRefMode,
|
|
2061
|
+
reference_path: Optional[Path] = None,
|
|
2062
|
+
):
|
|
2063
|
+
new_doc = None
|
|
2064
|
+
if image_mode == ImageRefMode.PLACEHOLDER:
|
|
2065
|
+
new_doc = self
|
|
2066
|
+
elif image_mode == ImageRefMode.REFERENCED:
|
|
2067
|
+
new_doc = self._with_pictures_refs(
|
|
2068
|
+
image_dir=artifacts_dir, reference_path=reference_path
|
|
2069
|
+
)
|
|
2070
|
+
elif image_mode == ImageRefMode.EMBEDDED:
|
|
2071
|
+
new_doc = self._with_embedded_pictures()
|
|
2072
|
+
else:
|
|
2073
|
+
raise ValueError("Unsupported ImageRefMode")
|
|
2074
|
+
return new_doc
|
|
2075
|
+
|
|
2076
|
+
def export_to_html(  # noqa: C901
    self,
    from_element: int = 0,
    to_element: int = sys.maxsize,
    labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
    image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
    page_no: Optional[int] = None,
    html_lang: str = "en",
    html_head: str = _HTML_DEFAULT_HEAD,
) -> str:
    r"""Serialize to HTML.

    Walks ``self.body`` with ``iterate_items(..., with_groups=True)`` and
    emits one HTML fragment per supported item. Text taken from items is
    HTML-escaped before being wrapped in tags; the HTML returned by
    ``TableItem.export_to_html`` and ``PictureItem.export_to_html`` is
    inserted as returned.

    :param from_element: Index (in iteration order, groups included) of the
        first item to export.
    :param to_element: Index of the first item that is no longer exported
        (exclusive upper bound).
    :param labels: Whitelist of labels; any ``DocItem`` whose label is not in
        this set is skipped.
    :param image_mode: Forwarded to ``PictureItem.export_to_html``.
    :param page_no: Forwarded to ``iterate_items``.
    :param html_lang: Value of the ``lang`` attribute on the ``<html>`` tag.
        It is inserted verbatim (not escaped).
    :param html_head: Markup placed directly after the opening ``<html>`` tag.
        It is inserted verbatim (not escaped).
    :returns: The HTML document as a single string. Newlines inside each
        fragment are replaced by ``<br>`` and fragments are joined with
        ``"\n"``.
    """
    # Function-scope import keeps this method self-contained.
    import html

    def escape(text: str) -> str:
        """Escape ``&``, ``<`` and ``>`` so item text cannot inject markup."""
        return html.escape(text, quote=False)

    def closing_tag(is_ordered: bool) -> str:
        """Return the closing tag for an ordered or unordered list."""
        return "</ol>" if is_ordered else "</ul>"

    def close_lists(
        curr_level: int,
        prev_level: int,
        in_ordered_list: List[bool],
        html_texts: list[str],
    ):
        """Close one open list per level dropped, innermost first."""
        while curr_level < prev_level and in_ordered_list:
            html_texts.append(closing_tag(in_ordered_list.pop()))
            prev_level -= 1

        return (in_ordered_list, html_texts)

    head_lines = ["<!DOCTYPE html>", f'<html lang="{html_lang}">', html_head]
    html_texts: list[str] = []

    prev_level = 0  # Track the previous item's level

    # Stack of currently open lists: True for <ol>, False for <ul>.
    in_ordered_list: List[bool] = []

    for ix, (item, curr_level) in enumerate(
        self.iterate_items(self.body, with_groups=True, page_no=page_no)
    ):
        # If we've moved to a lower level, we're exiting one or more groups
        if curr_level < prev_level and in_ordered_list:
            in_ordered_list, html_texts = close_lists(
                curr_level=curr_level,
                prev_level=prev_level,
                in_ordered_list=in_ordered_list,
                html_texts=html_texts,
            )

        prev_level = curr_level  # Update previous_level for next iteration

        if ix < from_element or to_element <= ix:
            continue  # skip as many items as you want

        if isinstance(item, DocItem) and item.label not in labels:
            continue  # skip any label that is not whitelisted

        if isinstance(item, GroupItem) and item.label == GroupLabel.ORDERED_LIST:
            html_texts.append("<ol>")
            in_ordered_list.append(True)

        elif isinstance(item, GroupItem) and item.label == GroupLabel.LIST:
            html_texts.append("<ul>")
            in_ordered_list.append(False)

        elif isinstance(item, GroupItem):
            # Other group kinds produce no markup of their own.
            continue

        elif isinstance(item, TextItem) and item.label == DocItemLabel.TITLE:
            html_texts.append(f"<h1>{escape(item.text)}</h1>")

        elif isinstance(item, SectionHeaderItem):
            # <h1> is used for the title, so headers start at <h2>;
            # HTML defines nothing beyond <h6>, hence the cap.
            section_level: int = min(item.level + 1, 6)
            html_texts.append(
                f"<h{section_level}>{escape(item.text)}</h{section_level}>"
            )

        elif isinstance(item, TextItem) and item.label == DocItemLabel.SECTION_HEADER:
            # Plain text items labelled as headers carry no level of their
            # own, so the tree depth is used, clamped to <h2>..<h6>.
            section_level = min(max(curr_level, 2), 6)
            html_texts.append(
                f"<h{section_level}>{escape(item.text)}</h{section_level}>"
            )

        elif isinstance(item, TextItem) and item.label == DocItemLabel.CODE:
            html_texts.append(f"<pre>{escape(item.text)}</pre>")

        elif isinstance(item, TextItem) and item.label == DocItemLabel.CAPTION:
            # captions are printed in picture and table ... skipping for now
            continue

        elif isinstance(item, ListItem):
            html_texts.append(f"<li>{escape(item.text)}</li>")

        elif isinstance(item, TextItem) and item.label == DocItemLabel.LIST_ITEM:
            html_texts.append(f"<li>{escape(item.text)}</li>")

        elif isinstance(item, TextItem) and item.label in labels:
            html_texts.append(f"<p>{escape(item.text)}</p>")

        elif isinstance(item, TableItem):
            html_texts.append(item.export_to_html(doc=self, add_caption=True))

        elif isinstance(item, PictureItem):
            html_texts.append(
                item.export_to_html(
                    doc=self, add_caption=True, image_mode=image_mode
                )
            )

        # Any other whitelisted DocItem has no HTML representation here.

    # Lists that are still open when the items run out would otherwise be
    # left dangling; the stack mirrors exactly the <ol>/<ul> tags emitted.
    while in_ordered_list:
        html_texts.append(closing_tag(in_ordered_list.pop()))

    html_texts.append("</html>")

    lines = list(head_lines)
    lines.extend(fragment.replace("\n", "<br>") for fragment in html_texts)

    return "\n".join(lines).strip()
|
|
2242
|
+
|
|
2243
|
+
def save_as_document_tokens(
    self,
    filename: Path,
    delim: str = "\n\n",
    from_element: int = 0,
    to_element: int = sys.maxsize,
    labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
    xsize: int = 100,
    ysize: int = 100,
    add_location: bool = True,
    add_content: bool = True,
    add_page_index: bool = True,
    # table specific flags
    add_table_cell_location: bool = False,
    add_table_cell_label: bool = True,
    add_table_cell_text: bool = True,
    # specifics
    page_no: Optional[int] = None,
    with_groups: bool = True,
):
    r"""Save the document content to a DocumentToken format.

    Every argument except ``filename`` is forwarded unchanged to
    ``export_to_document_tokens``; the returned string is written to
    ``filename`` as UTF-8 text, replacing any existing file.

    NOTE(review): the visible part of ``export_to_document_tokens`` resets
    ``delim`` from its own ``newline`` flag, so the ``delim`` passed here
    appears to have no effect -- confirm before relying on it.

    :param filename: Path of the file to write.
    """
    out = self.export_to_document_tokens(
        delim=delim,
        from_element=from_element,
        to_element=to_element,
        labels=labels,
        xsize=xsize,
        ysize=ysize,
        add_location=add_location,
        add_content=add_content,
        add_page_index=add_page_index,
        # table specific flags
        add_table_cell_location=add_table_cell_location,
        add_table_cell_label=add_table_cell_label,
        add_table_cell_text=add_table_cell_text,
        # specifics
        page_no=page_no,
        with_groups=with_groups,
    )

    # Explicit encoding: the platform default (e.g. cp1252 on Windows) can
    # fail on, or mis-encode, non-ASCII document text.
    with open(filename, "w", encoding="utf-8") as fw:
        fw.write(out)
|
|
2285
|
+
|
|
2286
|
+
def export_to_document_tokens(
|
|
2287
|
+
self,
|
|
2288
|
+
delim: str = "\n",
|
|
2289
|
+
from_element: int = 0,
|
|
2290
|
+
to_element: int = sys.maxsize,
|
|
1526
2291
|
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
|
|
1527
2292
|
xsize: int = 100,
|
|
1528
2293
|
ysize: int = 100,
|
|
@@ -1533,8 +2298,12 @@ class DoclingDocument(BaseModel):
|
|
|
1533
2298
|
add_table_cell_location: bool = False,
|
|
1534
2299
|
add_table_cell_label: bool = True,
|
|
1535
2300
|
add_table_cell_text: bool = True,
|
|
2301
|
+
# specifics
|
|
2302
|
+
page_no: Optional[int] = None,
|
|
2303
|
+
with_groups: bool = True,
|
|
2304
|
+
newline: bool = True,
|
|
1536
2305
|
) -> str:
|
|
1537
|
-
r"""Exports the document content to
|
|
2306
|
+
r"""Exports the document content to a DocumentToken format.
|
|
1538
2307
|
|
|
1539
2308
|
Operates on a slice of the document's body as defined through arguments
|
|
1540
2309
|
from_element and to_element; defaulting to the whole main_text.
|
|
@@ -1554,44 +2323,102 @@ class DoclingDocument(BaseModel):
|
|
|
1554
2323
|
:returns: The content of the document formatted as a DocTags string.
|
|
1555
2324
|
:rtype: str
|
|
1556
2325
|
"""
|
|
1557
|
-
new_line = ""
|
|
1558
|
-
if delim:
|
|
1559
|
-
new_line = "\n"
|
|
1560
2326
|
|
|
1561
|
-
|
|
2327
|
+
def close_lists(
|
|
2328
|
+
curr_level: int,
|
|
2329
|
+
prev_level: int,
|
|
2330
|
+
in_ordered_list: List[bool],
|
|
2331
|
+
result: str,
|
|
2332
|
+
delim: str,
|
|
2333
|
+
):
|
|
2334
|
+
|
|
2335
|
+
if len(in_ordered_list) == 0:
|
|
2336
|
+
return (in_ordered_list, result)
|
|
2337
|
+
|
|
2338
|
+
while curr_level < prev_level and len(in_ordered_list) > 0:
|
|
2339
|
+
if in_ordered_list[-1]:
|
|
2340
|
+
result += f"</ordered_list>{delim}"
|
|
2341
|
+
else:
|
|
2342
|
+
result += f"</unordered_list>{delim}"
|
|
1562
2343
|
|
|
1563
|
-
|
|
2344
|
+
prev_level -= 1
|
|
2345
|
+
in_ordered_list.pop() # = in_ordered_list[:-1]
|
|
2346
|
+
|
|
2347
|
+
return (in_ordered_list, result)
|
|
2348
|
+
|
|
2349
|
+
if newline:
|
|
2350
|
+
delim = "\n"
|
|
2351
|
+
else:
|
|
2352
|
+
delim = ""
|
|
1564
2353
|
|
|
1565
|
-
|
|
1566
|
-
|
|
1567
|
-
|
|
1568
|
-
|
|
2354
|
+
prev_level = 0 # Track the previous item's level
|
|
2355
|
+
|
|
2356
|
+
in_ordered_list: List[bool] = [] # False
|
|
2357
|
+
|
|
2358
|
+
result = f"{DocumentToken.BEG_DOCUMENT.value}{delim}"
|
|
2359
|
+
|
|
2360
|
+
for ix, (item, curr_level) in enumerate(
|
|
2361
|
+
self.iterate_items(self.body, with_groups=True)
|
|
2362
|
+
):
|
|
2363
|
+
|
|
2364
|
+
# If we've moved to a lower level, we're exiting one or more groups
|
|
2365
|
+
if curr_level < prev_level and len(in_ordered_list) > 0:
|
|
2366
|
+
# Calculate how many levels we've exited
|
|
2367
|
+
# level_difference = previous_level - level
|
|
2368
|
+
# Decrement list_nesting_level for each list group we've exited
|
|
2369
|
+
# list_nesting_level = max(0, list_nesting_level - level_difference)
|
|
2370
|
+
|
|
2371
|
+
in_ordered_list, result = close_lists(
|
|
2372
|
+
curr_level=curr_level,
|
|
2373
|
+
prev_level=prev_level,
|
|
2374
|
+
in_ordered_list=in_ordered_list,
|
|
2375
|
+
result=result,
|
|
2376
|
+
delim=delim,
|
|
2377
|
+
)
|
|
2378
|
+
|
|
2379
|
+
prev_level = curr_level # Update previous_level for next iteration
|
|
2380
|
+
|
|
2381
|
+
if ix < from_element or to_element <= ix:
|
|
1569
2382
|
continue # skip as many items as you want
|
|
1570
2383
|
|
|
1571
|
-
if
|
|
1572
|
-
|
|
2384
|
+
if (isinstance(item, DocItem)) and (item.label not in labels):
|
|
2385
|
+
continue # skip any label that is not whitelisted
|
|
1573
2386
|
|
|
1574
|
-
if
|
|
1575
|
-
|
|
2387
|
+
if isinstance(item, GroupItem) and item.label in [
|
|
2388
|
+
GroupLabel.ORDERED_LIST,
|
|
2389
|
+
]:
|
|
1576
2390
|
|
|
1577
|
-
|
|
2391
|
+
result += f"<ordered_list>{delim}"
|
|
2392
|
+
in_ordered_list.append(True)
|
|
1578
2393
|
|
|
1579
|
-
|
|
2394
|
+
elif isinstance(item, GroupItem) and item.label in [
|
|
2395
|
+
GroupLabel.LIST,
|
|
2396
|
+
]:
|
|
1580
2397
|
|
|
1581
|
-
|
|
2398
|
+
result += f"<unordered_list>{delim}"
|
|
2399
|
+
in_ordered_list.append(False)
|
|
1582
2400
|
|
|
1583
|
-
|
|
1584
|
-
|
|
2401
|
+
elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
|
|
2402
|
+
# captions are printed in picture and table ... skipping for now
|
|
2403
|
+
continue
|
|
1585
2404
|
|
|
1586
|
-
|
|
1587
|
-
|
|
2405
|
+
elif isinstance(item, SectionHeaderItem):
|
|
2406
|
+
|
|
2407
|
+
result += item.export_to_document_tokens(
|
|
2408
|
+
doc=self,
|
|
2409
|
+
new_line=delim,
|
|
2410
|
+
xsize=xsize,
|
|
2411
|
+
ysize=ysize,
|
|
2412
|
+
add_location=add_location,
|
|
2413
|
+
add_content=add_content,
|
|
2414
|
+
add_page_index=add_page_index,
|
|
2415
|
+
)
|
|
1588
2416
|
|
|
1589
|
-
|
|
1590
|
-
if isinstance(item, TextItem) and (item_type in labels):
|
|
2417
|
+
elif isinstance(item, TextItem) and (item.label in labels):
|
|
1591
2418
|
|
|
1592
|
-
|
|
2419
|
+
result += item.export_to_document_tokens(
|
|
1593
2420
|
doc=self,
|
|
1594
|
-
new_line=
|
|
2421
|
+
new_line=delim,
|
|
1595
2422
|
xsize=xsize,
|
|
1596
2423
|
ysize=ysize,
|
|
1597
2424
|
add_location=add_location,
|
|
@@ -1599,11 +2426,11 @@ class DoclingDocument(BaseModel):
|
|
|
1599
2426
|
add_page_index=add_page_index,
|
|
1600
2427
|
)
|
|
1601
2428
|
|
|
1602
|
-
elif isinstance(item, TableItem) and (
|
|
2429
|
+
elif isinstance(item, TableItem) and (item.label in labels):
|
|
1603
2430
|
|
|
1604
|
-
|
|
2431
|
+
result += item.export_to_document_tokens(
|
|
1605
2432
|
doc=self,
|
|
1606
|
-
new_line=
|
|
2433
|
+
new_line=delim,
|
|
1607
2434
|
xsize=xsize,
|
|
1608
2435
|
ysize=ysize,
|
|
1609
2436
|
add_caption=True,
|
|
@@ -1615,11 +2442,11 @@ class DoclingDocument(BaseModel):
|
|
|
1615
2442
|
add_page_index=add_page_index,
|
|
1616
2443
|
)
|
|
1617
2444
|
|
|
1618
|
-
elif isinstance(item, PictureItem) and (
|
|
2445
|
+
elif isinstance(item, PictureItem) and (item.label in labels):
|
|
1619
2446
|
|
|
1620
|
-
|
|
2447
|
+
result += item.export_to_document_tokens(
|
|
1621
2448
|
doc=self,
|
|
1622
|
-
new_line=
|
|
2449
|
+
new_line=delim,
|
|
1623
2450
|
xsize=xsize,
|
|
1624
2451
|
ysize=ysize,
|
|
1625
2452
|
add_caption=True,
|
|
@@ -1628,9 +2455,9 @@ class DoclingDocument(BaseModel):
|
|
|
1628
2455
|
add_page_index=add_page_index,
|
|
1629
2456
|
)
|
|
1630
2457
|
|
|
1631
|
-
|
|
2458
|
+
result += DocumentToken.END_DOCUMENT.value
|
|
1632
2459
|
|
|
1633
|
-
return
|
|
2460
|
+
return result
|
|
1634
2461
|
|
|
1635
2462
|
def _export_to_indented_text(
|
|
1636
2463
|
self, indent=" ", max_text_len: int = -1, explicit_tables: bool = False
|
docling_core/utils/file.py
CHANGED
|
@@ -65,3 +65,43 @@ def resolve_file_source(
|
|
|
65
65
|
except ValidationError:
|
|
66
66
|
raise ValueError(f"Unexpected source type encountered: {type(source)}")
|
|
67
67
|
return local_path
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def relative_path(src: Path, target: Path) -> Path:
    """Compute the relative path from `src` to `target`.

    Both paths are passed through ``Path.resolve()`` first. Every component
    of `src` below the shared prefix becomes one ``..`` step, so `src` is
    treated as the directory to start from.

    Args:
        src (str | Path): The starting path.
        target (str | Path): The path to reach.

    Returns:
        Path: The relative path from `src` to `target`.

    Raises:
        ValueError: If `src` or `target` is not absolute after resolution.
            This is a defensive check, since ``resolve()`` already makes
            paths absolute.
    """
    src = Path(src).resolve()
    target = Path(target).resolve()

    # Ensure both paths are absolute
    if not src.is_absolute():
        raise ValueError(f"The source path must be absolute: {src}")
    if not target.is_absolute():
        raise ValueError(f"The target path must be absolute: {target}")

    # Count how many leading components the two paths share.
    shared = 0
    for src_part, target_part in zip(src.parts, target.parts):
        if src_part != target_part:
            break
        shared += 1

    # Climb from src up to the common ancestor, then descend to target.
    climb = [".."] * (len(src.parts) - shared)
    descend = target.parts[shared:]

    return Path(*climb, *descend)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.5.1
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Home-page: https://ds4sd.github.io/
|
|
6
6
|
License: MIT
|
|
@@ -30,6 +30,7 @@ Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
|
|
|
30
30
|
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
|
31
31
|
Requires-Dist: pillow (>=10.3.0,<11.0.0)
|
|
32
32
|
Requires-Dist: pydantic (>=2.6.0,<2.10)
|
|
33
|
+
Requires-Dist: pyyaml (>=5.1,<7.0.0)
|
|
33
34
|
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
|
34
35
|
Project-URL: Repository, https://github.com/DS4SD/docling-core
|
|
35
36
|
Description-Content-Type: text/markdown
|
|
@@ -20,8 +20,8 @@ docling_core/transforms/chunker/hierarchical_chunker.py,sha256=V4FiOYqL0GgBqVB7x
|
|
|
20
20
|
docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
|
|
21
21
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
22
22
|
docling_core/types/doc/__init__.py,sha256=bEL4zKVOG7Wxm6xQrgF58mu-Teds9aSavuEAKVNhrTU,639
|
|
23
|
-
docling_core/types/doc/base.py,sha256=
|
|
24
|
-
docling_core/types/doc/document.py,sha256=
|
|
23
|
+
docling_core/types/doc/base.py,sha256=_ttU8QI8wXDTQRUnN5n7L6D9wYFVLSAibxlFoMbgAsk,4557
|
|
24
|
+
docling_core/types/doc/document.py,sha256=apWwh2ixsVc0axtqJec3xKNuYmEwFDB00fQ2vJdKgBA,86018
|
|
25
25
|
docling_core/types/doc/labels.py,sha256=A8vWP82VAeXO1rlCO0oDKo_Hb8uDeQe0myOTY3P03hk,1596
|
|
26
26
|
docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
|
|
27
27
|
docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
|
|
@@ -44,13 +44,13 @@ docling_core/types/rec/statement.py,sha256=YwcV4CbVaAbzNwh14yJ_6Py3Ww0XnUJrEEUiK
|
|
|
44
44
|
docling_core/types/rec/subject.py,sha256=PRCERGTMs4YhR3_Ne6jogkm41zYg8uUWb1yFpM7atm4,2572
|
|
45
45
|
docling_core/utils/__init__.py,sha256=VauNNpWRHG0_ISKrsy5-gTxicrdQZSau6qMfuMl3iqk,120
|
|
46
46
|
docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,874
|
|
47
|
-
docling_core/utils/file.py,sha256=
|
|
47
|
+
docling_core/utils/file.py,sha256=ug4-z0KuthkEb_d5YDRPbY79PWfNSj9GYsi16xF2sDA,3699
|
|
48
48
|
docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
|
|
49
49
|
docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
|
|
50
50
|
docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
|
|
51
51
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
52
|
-
docling_core-2.
|
|
53
|
-
docling_core-2.
|
|
54
|
-
docling_core-2.
|
|
55
|
-
docling_core-2.
|
|
56
|
-
docling_core-2.
|
|
52
|
+
docling_core-2.5.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
53
|
+
docling_core-2.5.1.dist-info/METADATA,sha256=9K3Hip_Uev5copWGL0ragXG-N5uFHQiF2SNk0se2m_o,5468
|
|
54
|
+
docling_core-2.5.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
55
|
+
docling_core-2.5.1.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
|
|
56
|
+
docling_core-2.5.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|