docling-core 2.4.0__py3-none-any.whl → 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/types/doc/base.py +3 -2
- docling_core/types/doc/document.py +905 -88
- docling_core/utils/file.py +40 -0
- {docling_core-2.4.0.dist-info → docling_core-2.5.0.dist-info}/METADATA +3 -2
- {docling_core-2.4.0.dist-info → docling_core-2.5.0.dist-info}/RECORD +8 -8
- {docling_core-2.4.0.dist-info → docling_core-2.5.0.dist-info}/LICENSE +0 -0
- {docling_core-2.4.0.dist-info → docling_core-2.5.0.dist-info}/WHEEL +0 -0
- {docling_core-2.4.0.dist-info → docling_core-2.5.0.dist-info}/entry_points.txt +0 -0
docling_core/types/doc/base.py
CHANGED
|
@@ -10,8 +10,9 @@ from pydantic import BaseModel
|
|
|
10
10
|
class ImageRefMode(str, Enum):
|
|
11
11
|
"""ImageRefMode."""
|
|
12
12
|
|
|
13
|
-
PLACEHOLDER = "placeholder"
|
|
14
|
-
EMBEDDED = "embedded"
|
|
13
|
+
PLACEHOLDER = "placeholder" # just a place-holder
|
|
14
|
+
EMBEDDED = "embedded" # embed the image as a base64
|
|
15
|
+
REFERENCED = "referenced" # reference the image via uri
|
|
15
16
|
|
|
16
17
|
|
|
17
18
|
class CoordOrigin(str, Enum):
|
|
@@ -1,15 +1,22 @@
|
|
|
1
1
|
"""Models for the Docling Document data type."""
|
|
2
2
|
|
|
3
3
|
import base64
|
|
4
|
+
import copy
|
|
5
|
+
import hashlib
|
|
6
|
+
import json
|
|
4
7
|
import mimetypes
|
|
8
|
+
import os
|
|
5
9
|
import re
|
|
6
10
|
import sys
|
|
7
11
|
import textwrap
|
|
8
12
|
import typing
|
|
9
13
|
from io import BytesIO
|
|
14
|
+
from pathlib import Path
|
|
10
15
|
from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
|
|
16
|
+
from urllib.parse import unquote
|
|
11
17
|
|
|
12
18
|
import pandas as pd
|
|
19
|
+
import yaml
|
|
13
20
|
from PIL import Image as PILImage
|
|
14
21
|
from pydantic import (
|
|
15
22
|
AnyUrl,
|
|
@@ -30,6 +37,7 @@ from docling_core.types.doc import BoundingBox, Size
|
|
|
30
37
|
from docling_core.types.doc.base import ImageRefMode
|
|
31
38
|
from docling_core.types.doc.labels import DocItemLabel, GroupLabel
|
|
32
39
|
from docling_core.types.legacy_doc.tokens import DocumentToken
|
|
40
|
+
from docling_core.utils.file import relative_path
|
|
33
41
|
|
|
34
42
|
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
|
|
35
43
|
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
|
|
@@ -436,21 +444,25 @@ class ImageRef(BaseModel):
|
|
|
436
444
|
mimetype: str
|
|
437
445
|
dpi: int
|
|
438
446
|
size: Size
|
|
439
|
-
uri: AnyUrl
|
|
447
|
+
uri: Union[AnyUrl, Path]
|
|
440
448
|
_pil: Optional[PILImage.Image] = None
|
|
441
449
|
|
|
442
450
|
@property
|
|
443
|
-
def pil_image(self) -> PILImage.Image:
|
|
451
|
+
def pil_image(self) -> Optional[PILImage.Image]:
|
|
444
452
|
"""Return the PIL Image."""
|
|
445
453
|
if self._pil is not None:
|
|
446
454
|
return self._pil
|
|
447
455
|
|
|
448
|
-
if
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
self.
|
|
456
|
+
if isinstance(self.uri, AnyUrl):
|
|
457
|
+
if self.uri.scheme == "data":
|
|
458
|
+
encoded_img = str(self.uri).split(",")[1]
|
|
459
|
+
decoded_img = base64.b64decode(encoded_img)
|
|
460
|
+
self._pil = PILImage.open(BytesIO(decoded_img))
|
|
461
|
+
elif self.uri.scheme == "file":
|
|
462
|
+
self._pil = PILImage.open(unquote(str(self.uri.path)))
|
|
463
|
+
# else: Handle http request or other protocols...
|
|
464
|
+
elif isinstance(self.uri, Path):
|
|
465
|
+
self._pil = PILImage.open(self.uri)
|
|
454
466
|
|
|
455
467
|
return self._pil
|
|
456
468
|
|
|
@@ -566,6 +578,8 @@ class DocItem(
|
|
|
566
578
|
return None
|
|
567
579
|
|
|
568
580
|
page_image = page.image.pil_image
|
|
581
|
+
if not page_image:
|
|
582
|
+
return None
|
|
569
583
|
crop_bbox = (
|
|
570
584
|
self.prov[0]
|
|
571
585
|
.bbox.to_top_left_origin(page_height=page.size.height)
|
|
@@ -631,6 +645,50 @@ class SectionHeaderItem(TextItem):
|
|
|
631
645
|
label: typing.Literal[DocItemLabel.SECTION_HEADER] = DocItemLabel.SECTION_HEADER
|
|
632
646
|
level: LevelNumber
|
|
633
647
|
|
|
648
|
+
def export_to_document_tokens(
|
|
649
|
+
self,
|
|
650
|
+
doc: "DoclingDocument",
|
|
651
|
+
new_line: str = "\n",
|
|
652
|
+
xsize: int = 100,
|
|
653
|
+
ysize: int = 100,
|
|
654
|
+
add_location: bool = True,
|
|
655
|
+
add_content: bool = True,
|
|
656
|
+
add_page_index: bool = True,
|
|
657
|
+
):
|
|
658
|
+
r"""Export text element to document tokens format.
|
|
659
|
+
|
|
660
|
+
:param doc: "DoclingDocument":
|
|
661
|
+
:param new_line: str: (Default value = "\n")
|
|
662
|
+
:param xsize: int: (Default value = 100)
|
|
663
|
+
:param ysize: int: (Default value = 100)
|
|
664
|
+
:param add_location: bool: (Default value = True)
|
|
665
|
+
:param add_content: bool: (Default value = True)
|
|
666
|
+
:param add_page_index: bool: (Default value = True)
|
|
667
|
+
|
|
668
|
+
"""
|
|
669
|
+
body = f"<{self.label.value}_level_{self.level}>"
|
|
670
|
+
|
|
671
|
+
# TODO: This must be done through an explicit mapping.
|
|
672
|
+
# assert DocumentToken.is_known_token(
|
|
673
|
+
# body
|
|
674
|
+
# ), f"failed DocumentToken.is_known_token({body})"
|
|
675
|
+
|
|
676
|
+
if add_location:
|
|
677
|
+
body += self.get_location_tokens(
|
|
678
|
+
doc=doc,
|
|
679
|
+
new_line="",
|
|
680
|
+
xsize=xsize,
|
|
681
|
+
ysize=ysize,
|
|
682
|
+
add_page_index=add_page_index,
|
|
683
|
+
)
|
|
684
|
+
|
|
685
|
+
if add_content and self.text is not None:
|
|
686
|
+
body += self.text.strip()
|
|
687
|
+
|
|
688
|
+
body += f"</{self.label.value}_level_{self.level}>{new_line}"
|
|
689
|
+
|
|
690
|
+
return body
|
|
691
|
+
|
|
634
692
|
|
|
635
693
|
class ListItem(TextItem):
|
|
636
694
|
"""SectionItem."""
|
|
@@ -677,6 +735,152 @@ class PictureItem(FloatingItem):
|
|
|
677
735
|
|
|
678
736
|
annotations: List[PictureDataType] = []
|
|
679
737
|
|
|
738
|
+
# Convert the image to Base64
|
|
739
|
+
def _image_to_base64(self, pil_image, format="PNG"):
|
|
740
|
+
"""Base64 representation of the image."""
|
|
741
|
+
buffered = BytesIO()
|
|
742
|
+
pil_image.save(buffered, format=format) # Save the image to the byte stream
|
|
743
|
+
img_bytes = buffered.getvalue() # Get the byte data
|
|
744
|
+
img_base64 = base64.b64encode(img_bytes).decode(
|
|
745
|
+
"utf-8"
|
|
746
|
+
) # Encode to Base64 and decode to string
|
|
747
|
+
return img_base64
|
|
748
|
+
|
|
749
|
+
def _image_to_hexhash(self) -> Optional[str]:
|
|
750
|
+
"""Hexash from the image."""
|
|
751
|
+
if self.image is not None and self.image._pil is not None:
|
|
752
|
+
# Convert the image to raw bytes
|
|
753
|
+
image_bytes = self.image._pil.tobytes()
|
|
754
|
+
|
|
755
|
+
# Create a hash object (e.g., SHA-256)
|
|
756
|
+
hasher = hashlib.sha256()
|
|
757
|
+
|
|
758
|
+
# Feed the image bytes into the hash object
|
|
759
|
+
hasher.update(image_bytes)
|
|
760
|
+
|
|
761
|
+
# Get the hexadecimal representation of the hash
|
|
762
|
+
return hasher.hexdigest()
|
|
763
|
+
|
|
764
|
+
return None
|
|
765
|
+
|
|
766
|
+
def export_to_markdown(
|
|
767
|
+
self,
|
|
768
|
+
doc: "DoclingDocument",
|
|
769
|
+
add_caption: bool = True,
|
|
770
|
+
image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
|
|
771
|
+
image_placeholder: str = "<!-- image -->",
|
|
772
|
+
) -> str:
|
|
773
|
+
"""Export picture to Markdown format."""
|
|
774
|
+
default_response = "\n" + image_placeholder + "\n"
|
|
775
|
+
error_response = (
|
|
776
|
+
"\n<!-- 🖼️❌ Image not available. "
|
|
777
|
+
"Please use `PdfPipelineOptions(generate_picture_images=True)`"
|
|
778
|
+
" --> \n"
|
|
779
|
+
)
|
|
780
|
+
|
|
781
|
+
if image_mode == ImageRefMode.PLACEHOLDER:
|
|
782
|
+
return default_response
|
|
783
|
+
|
|
784
|
+
elif image_mode == ImageRefMode.EMBEDDED:
|
|
785
|
+
|
|
786
|
+
# short-cut: we already have the image in base64
|
|
787
|
+
if (
|
|
788
|
+
isinstance(self.image, ImageRef)
|
|
789
|
+
and isinstance(self.image.uri, AnyUrl)
|
|
790
|
+
and self.image.uri.scheme == "data"
|
|
791
|
+
):
|
|
792
|
+
text = f"\n![Image]({self.image.uri})\n"
|
|
793
|
+
return text
|
|
794
|
+
|
|
795
|
+
# get the self.image._pil or crop it out of the page-image
|
|
796
|
+
img = self.get_image(doc)
|
|
797
|
+
|
|
798
|
+
if img is not None:
|
|
799
|
+
imgb64 = self._image_to_base64(img)
|
|
800
|
+
text = f"\n![Image](data:image/png;base64,{imgb64})\n"
|
|
801
|
+
|
|
802
|
+
return text
|
|
803
|
+
else:
|
|
804
|
+
return error_response
|
|
805
|
+
|
|
806
|
+
elif image_mode == ImageRefMode.REFERENCED:
|
|
807
|
+
if not isinstance(self.image, ImageRef) or (
|
|
808
|
+
isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "data"
|
|
809
|
+
):
|
|
810
|
+
return default_response
|
|
811
|
+
|
|
812
|
+
if (
|
|
813
|
+
isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "file"
|
|
814
|
+
) or isinstance(self.image.uri, Path):
|
|
815
|
+
text = f"\n![Image]({str(self.image.uri)})\n"
|
|
816
|
+
return text
|
|
817
|
+
|
|
818
|
+
else:
|
|
819
|
+
return default_response
|
|
820
|
+
|
|
821
|
+
else:
|
|
822
|
+
return default_response
|
|
823
|
+
|
|
824
|
+
def export_to_html(
|
|
825
|
+
self,
|
|
826
|
+
doc: "DoclingDocument",
|
|
827
|
+
add_caption: bool = True,
|
|
828
|
+
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
|
|
829
|
+
) -> str:
|
|
830
|
+
"""Export picture to HTML format."""
|
|
831
|
+
text = ""
|
|
832
|
+
if add_caption and len(self.captions):
|
|
833
|
+
text = self.caption_text(doc)
|
|
834
|
+
|
|
835
|
+
caption_text = ""
|
|
836
|
+
if len(text) > 0:
|
|
837
|
+
caption_text = f"<figcaption>{text}</figcaption>"
|
|
838
|
+
|
|
839
|
+
default_response = f"<figure>{caption_text}</figure>"
|
|
840
|
+
|
|
841
|
+
if image_mode == ImageRefMode.PLACEHOLDER:
|
|
842
|
+
return default_response
|
|
843
|
+
|
|
844
|
+
elif image_mode == ImageRefMode.EMBEDDED:
|
|
845
|
+
# short-cut: we already have the image in base64
|
|
846
|
+
if (
|
|
847
|
+
isinstance(self.image, ImageRef)
|
|
848
|
+
and isinstance(self.image.uri, AnyUrl)
|
|
849
|
+
and self.image.uri.scheme == "data"
|
|
850
|
+
):
|
|
851
|
+
img_text = f'<img src="{self.image.uri}">'
|
|
852
|
+
return f"<figure>{caption_text}{img_text}</figure>"
|
|
853
|
+
|
|
854
|
+
# get the self.image._pil or crop it out of the page-image
|
|
855
|
+
img = self.get_image(doc)
|
|
856
|
+
|
|
857
|
+
if img is not None:
|
|
858
|
+
imgb64 = self._image_to_base64(img)
|
|
859
|
+
img_text = f'<img src="data:image/png;base64,{imgb64}">'
|
|
860
|
+
|
|
861
|
+
return f"<figure>{caption_text}{img_text}</figure>"
|
|
862
|
+
else:
|
|
863
|
+
return default_response
|
|
864
|
+
|
|
865
|
+
elif image_mode == ImageRefMode.REFERENCED:
|
|
866
|
+
|
|
867
|
+
if not isinstance(self.image, ImageRef) or (
|
|
868
|
+
isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "data"
|
|
869
|
+
):
|
|
870
|
+
return default_response
|
|
871
|
+
|
|
872
|
+
if (
|
|
873
|
+
isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "file"
|
|
874
|
+
) or isinstance(self.image.uri, Path):
|
|
875
|
+
img_text = f'<img src="{str(self.image.uri)}">'
|
|
876
|
+
return f"<figure>{caption_text}{img_text}</figure>"
|
|
877
|
+
|
|
878
|
+
else:
|
|
879
|
+
return default_response
|
|
880
|
+
|
|
881
|
+
else:
|
|
882
|
+
return default_response
|
|
883
|
+
|
|
680
884
|
def export_to_document_tokens(
|
|
681
885
|
self,
|
|
682
886
|
doc: "DoclingDocument",
|
|
@@ -804,14 +1008,21 @@ class TableItem(FloatingItem):
|
|
|
804
1008
|
)
|
|
805
1009
|
return md_table
|
|
806
1010
|
|
|
807
|
-
def export_to_html(self) -> str:
|
|
1011
|
+
def export_to_html(self, doc: "DoclingDocument", add_caption: bool = True) -> str:
|
|
808
1012
|
"""Export the table as html."""
|
|
809
1013
|
body = ""
|
|
810
1014
|
nrows = self.data.num_rows
|
|
811
1015
|
ncols = self.data.num_cols
|
|
812
1016
|
|
|
813
|
-
|
|
1017
|
+
text = ""
|
|
1018
|
+
if add_caption and len(self.captions):
|
|
1019
|
+
text = self.caption_text(doc)
|
|
1020
|
+
|
|
1021
|
+
if len(self.data.table_cells) == 0:
|
|
814
1022
|
return ""
|
|
1023
|
+
|
|
1024
|
+
body = ""
|
|
1025
|
+
|
|
815
1026
|
for i in range(nrows):
|
|
816
1027
|
body += "<tr>"
|
|
817
1028
|
for j in range(ncols):
|
|
@@ -844,7 +1055,15 @@ class TableItem(FloatingItem):
|
|
|
844
1055
|
|
|
845
1056
|
body += f"<{opening_tag}>{content}</{celltag}>"
|
|
846
1057
|
body += "</tr>"
|
|
847
|
-
|
|
1058
|
+
|
|
1059
|
+
if len(text) > 0 and len(body) > 0:
|
|
1060
|
+
body = f"<table><caption>{text}</caption><tbody>{body}</tbody></table>"
|
|
1061
|
+
elif len(text) == 0 and len(body) > 0:
|
|
1062
|
+
body = f"<table><tbody>{body}</tbody></table>"
|
|
1063
|
+
elif len(text) > 0 and len(body) == 0:
|
|
1064
|
+
body = f"<table><caption>{text}</caption></table>"
|
|
1065
|
+
else:
|
|
1066
|
+
body = "<table></table>"
|
|
848
1067
|
|
|
849
1068
|
return body
|
|
850
1069
|
|
|
@@ -981,6 +1200,23 @@ class PageItem(BaseModel):
|
|
|
981
1200
|
class DoclingDocument(BaseModel):
|
|
982
1201
|
"""DoclingDocument."""
|
|
983
1202
|
|
|
1203
|
+
_HTML_DEFAULT_HEAD: str = r"""<head>
|
|
1204
|
+
<meta charset="UTF-8">
|
|
1205
|
+
<style>
|
|
1206
|
+
table {
|
|
1207
|
+
border-collapse: separate;
|
|
1208
|
+
/* Maintain separate borders */
|
|
1209
|
+
border-spacing: 5px; /*
|
|
1210
|
+
Space between cells */
|
|
1211
|
+
width: 50%;
|
|
1212
|
+
}
|
|
1213
|
+
th, td {
|
|
1214
|
+
border: 1px solid black;
|
|
1215
|
+
/* Add lines etween cells */
|
|
1216
|
+
padding: 8px; }
|
|
1217
|
+
</style>
|
|
1218
|
+
</head>"""
|
|
1219
|
+
|
|
984
1220
|
schema_name: typing.Literal["DoclingDocument"] = "DoclingDocument"
|
|
985
1221
|
version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
|
|
986
1222
|
CURRENT_VERSION
|
|
@@ -1045,7 +1281,7 @@ class DoclingDocument(BaseModel):
|
|
|
1045
1281
|
prov: Optional[ProvenanceItem] = None,
|
|
1046
1282
|
parent: Optional[GroupItem] = None,
|
|
1047
1283
|
):
|
|
1048
|
-
"""
|
|
1284
|
+
"""add_list_item.
|
|
1049
1285
|
|
|
1050
1286
|
:param label: str:
|
|
1051
1287
|
:param text: str:
|
|
@@ -1088,7 +1324,7 @@ class DoclingDocument(BaseModel):
|
|
|
1088
1324
|
prov: Optional[ProvenanceItem] = None,
|
|
1089
1325
|
parent: Optional[GroupItem] = None,
|
|
1090
1326
|
):
|
|
1091
|
-
"""
|
|
1327
|
+
"""add_text.
|
|
1092
1328
|
|
|
1093
1329
|
:param label: str:
|
|
1094
1330
|
:param text: str:
|
|
@@ -1097,28 +1333,41 @@ class DoclingDocument(BaseModel):
|
|
|
1097
1333
|
:param parent: Optional[GroupItem]: (Default value = None)
|
|
1098
1334
|
|
|
1099
1335
|
"""
|
|
1100
|
-
|
|
1101
|
-
|
|
1336
|
+
# Catch a few cases that are in principle allowed
|
|
1337
|
+
# but that will create confusion down the road
|
|
1338
|
+
if label in [DocItemLabel.TITLE]:
|
|
1339
|
+
return self.add_title(text=text, orig=orig, prov=prov, parent=parent)
|
|
1102
1340
|
|
|
1103
|
-
|
|
1104
|
-
orig =
|
|
1341
|
+
elif label in [DocItemLabel.LIST_ITEM]:
|
|
1342
|
+
return self.add_list_item(text=text, orig=orig, prov=prov, parent=parent)
|
|
1105
1343
|
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
text_item = TextItem(
|
|
1109
|
-
label=label,
|
|
1110
|
-
text=text,
|
|
1111
|
-
orig=orig,
|
|
1112
|
-
self_ref=cref,
|
|
1113
|
-
parent=parent.get_ref(),
|
|
1114
|
-
)
|
|
1115
|
-
if prov:
|
|
1116
|
-
text_item.prov.append(prov)
|
|
1344
|
+
elif label in [DocItemLabel.SECTION_HEADER]:
|
|
1345
|
+
return self.add_heading(text=text, orig=orig, prov=prov, parent=parent)
|
|
1117
1346
|
|
|
1118
|
-
|
|
1119
|
-
parent.children.append(RefItem(cref=cref))
|
|
1347
|
+
else:
|
|
1120
1348
|
|
|
1121
|
-
|
|
1349
|
+
if not parent:
|
|
1350
|
+
parent = self.body
|
|
1351
|
+
|
|
1352
|
+
if not orig:
|
|
1353
|
+
orig = text
|
|
1354
|
+
|
|
1355
|
+
text_index = len(self.texts)
|
|
1356
|
+
cref = f"#/texts/{text_index}"
|
|
1357
|
+
text_item = TextItem(
|
|
1358
|
+
label=label,
|
|
1359
|
+
text=text,
|
|
1360
|
+
orig=orig,
|
|
1361
|
+
self_ref=cref,
|
|
1362
|
+
parent=parent.get_ref(),
|
|
1363
|
+
)
|
|
1364
|
+
if prov:
|
|
1365
|
+
text_item.prov.append(prov)
|
|
1366
|
+
|
|
1367
|
+
self.texts.append(text_item)
|
|
1368
|
+
parent.children.append(RefItem(cref=cref))
|
|
1369
|
+
|
|
1370
|
+
return text_item
|
|
1122
1371
|
|
|
1123
1372
|
def add_table(
|
|
1124
1373
|
self,
|
|
@@ -1170,7 +1419,6 @@ class DoclingDocument(BaseModel):
|
|
|
1170
1419
|
:param RefItem]]: (Default value = None)
|
|
1171
1420
|
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1172
1421
|
:param parent: Optional[GroupItem]: (Default value = None)
|
|
1173
|
-
|
|
1174
1422
|
"""
|
|
1175
1423
|
if not parent:
|
|
1176
1424
|
parent = self.body
|
|
@@ -1195,6 +1443,43 @@ class DoclingDocument(BaseModel):
|
|
|
1195
1443
|
|
|
1196
1444
|
return fig_item
|
|
1197
1445
|
|
|
1446
|
+
def add_title(
|
|
1447
|
+
self,
|
|
1448
|
+
text: str,
|
|
1449
|
+
orig: Optional[str] = None,
|
|
1450
|
+
prov: Optional[ProvenanceItem] = None,
|
|
1451
|
+
parent: Optional[GroupItem] = None,
|
|
1452
|
+
):
|
|
1453
|
+
"""add_title.
|
|
1454
|
+
|
|
1455
|
+
:param text: str:
|
|
1456
|
+
:param orig: Optional[str]: (Default value = None)
|
|
1457
|
+
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1458
|
+
:param parent: Optional[GroupItem]: (Default value = None)
|
|
1459
|
+
"""
|
|
1460
|
+
if not parent:
|
|
1461
|
+
parent = self.body
|
|
1462
|
+
|
|
1463
|
+
if not orig:
|
|
1464
|
+
orig = text
|
|
1465
|
+
|
|
1466
|
+
text_index = len(self.texts)
|
|
1467
|
+
cref = f"#/texts/{text_index}"
|
|
1468
|
+
text_item = TextItem(
|
|
1469
|
+
label=DocItemLabel.TITLE,
|
|
1470
|
+
text=text,
|
|
1471
|
+
orig=orig,
|
|
1472
|
+
self_ref=cref,
|
|
1473
|
+
parent=parent.get_ref(),
|
|
1474
|
+
)
|
|
1475
|
+
if prov:
|
|
1476
|
+
text_item.prov.append(prov)
|
|
1477
|
+
|
|
1478
|
+
self.texts.append(text_item)
|
|
1479
|
+
parent.children.append(RefItem(cref=cref))
|
|
1480
|
+
|
|
1481
|
+
return text_item
|
|
1482
|
+
|
|
1198
1483
|
def add_heading(
|
|
1199
1484
|
self,
|
|
1200
1485
|
text: str,
|
|
@@ -1211,7 +1496,6 @@ class DoclingDocument(BaseModel):
|
|
|
1211
1496
|
:param level: LevelNumber: (Default value = 1)
|
|
1212
1497
|
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1213
1498
|
:param parent: Optional[GroupItem]: (Default value = None)
|
|
1214
|
-
|
|
1215
1499
|
"""
|
|
1216
1500
|
if not parent:
|
|
1217
1501
|
parent = self.body
|
|
@@ -1297,17 +1581,220 @@ class DoclingDocument(BaseModel):
|
|
|
1297
1581
|
page_no=page_no,
|
|
1298
1582
|
)
|
|
1299
1583
|
|
|
1584
|
+
def _clear_picture_pil_cache(self):
|
|
1585
|
+
"""Clear cache storage of all images."""
|
|
1586
|
+
for item, level in self.iterate_items(with_groups=False):
|
|
1587
|
+
if isinstance(item, PictureItem):
|
|
1588
|
+
if item.image is not None and item.image._pil is not None:
|
|
1589
|
+
item.image._pil.close()
|
|
1590
|
+
|
|
1591
|
+
def _list_images_on_disk(self) -> List[Path]:
|
|
1592
|
+
"""List all images on disk."""
|
|
1593
|
+
result: List[Path] = []
|
|
1594
|
+
|
|
1595
|
+
for item, level in self.iterate_items(with_groups=False):
|
|
1596
|
+
if isinstance(item, PictureItem):
|
|
1597
|
+
if item.image is not None:
|
|
1598
|
+
if (
|
|
1599
|
+
isinstance(item.image.uri, AnyUrl)
|
|
1600
|
+
and item.image.uri.scheme == "file"
|
|
1601
|
+
and item.image.uri.path is not None
|
|
1602
|
+
):
|
|
1603
|
+
local_path = Path(unquote(item.image.uri.path))
|
|
1604
|
+
result.append(local_path)
|
|
1605
|
+
elif isinstance(item.image.uri, Path):
|
|
1606
|
+
result.append(item.image.uri)
|
|
1607
|
+
|
|
1608
|
+
return result
|
|
1609
|
+
|
|
1610
|
+
def _with_embedded_pictures(self) -> "DoclingDocument":
|
|
1611
|
+
"""Document with embedded images.
|
|
1612
|
+
|
|
1613
|
+
Creates a copy of this document where all pictures referenced
|
|
1614
|
+
through a file URI are turned into base64 embedded form.
|
|
1615
|
+
"""
|
|
1616
|
+
result: DoclingDocument = copy.deepcopy(self)
|
|
1617
|
+
|
|
1618
|
+
for ix, (item, level) in enumerate(result.iterate_items(with_groups=True)):
|
|
1619
|
+
if isinstance(item, PictureItem):
|
|
1620
|
+
|
|
1621
|
+
if item.image is not None:
|
|
1622
|
+
if (
|
|
1623
|
+
isinstance(item.image.uri, AnyUrl)
|
|
1624
|
+
and item.image.uri.scheme == "file"
|
|
1625
|
+
):
|
|
1626
|
+
assert isinstance(item.image.uri.path, str)
|
|
1627
|
+
tmp_image = PILImage.open(str(unquote(item.image.uri.path)))
|
|
1628
|
+
item.image = ImageRef.from_pil(tmp_image, dpi=item.image.dpi)
|
|
1629
|
+
|
|
1630
|
+
elif isinstance(item.image.uri, Path):
|
|
1631
|
+
tmp_image = PILImage.open(str(item.image.uri))
|
|
1632
|
+
item.image = ImageRef.from_pil(tmp_image, dpi=item.image.dpi)
|
|
1633
|
+
|
|
1634
|
+
return result
|
|
1635
|
+
|
|
1636
|
+
def _with_pictures_refs(
|
|
1637
|
+
self, image_dir: Path, reference_path: Optional[Path] = None
|
|
1638
|
+
) -> "DoclingDocument":
|
|
1639
|
+
"""Document with images as refs.
|
|
1640
|
+
|
|
1641
|
+
Creates a copy of this document where all picture data is
|
|
1642
|
+
saved to image_dir and referenced through file URIs.
|
|
1643
|
+
"""
|
|
1644
|
+
result: DoclingDocument = copy.deepcopy(self)
|
|
1645
|
+
|
|
1646
|
+
img_count = 0
|
|
1647
|
+
image_dir.mkdir(parents=True, exist_ok=True)
|
|
1648
|
+
|
|
1649
|
+
if image_dir.is_dir():
|
|
1650
|
+
for item, level in result.iterate_items(with_groups=False):
|
|
1651
|
+
if isinstance(item, PictureItem):
|
|
1652
|
+
|
|
1653
|
+
if (
|
|
1654
|
+
item.image is not None
|
|
1655
|
+
and isinstance(item.image.uri, AnyUrl)
|
|
1656
|
+
and item.image.uri.scheme == "data"
|
|
1657
|
+
and item.image.pil_image is not None
|
|
1658
|
+
):
|
|
1659
|
+
img = item.image.pil_image
|
|
1660
|
+
|
|
1661
|
+
hexhash = item._image_to_hexhash()
|
|
1662
|
+
|
|
1663
|
+
# loc_path = image_dir / f"image_{img_count:06}.png"
|
|
1664
|
+
if hexhash is not None:
|
|
1665
|
+
loc_path = image_dir / f"image_{img_count:06}_{hexhash}.png"
|
|
1666
|
+
|
|
1667
|
+
img.save(loc_path)
|
|
1668
|
+
if reference_path is not None:
|
|
1669
|
+
obj_path = relative_path(
|
|
1670
|
+
reference_path.resolve(), loc_path.resolve()
|
|
1671
|
+
)
|
|
1672
|
+
else:
|
|
1673
|
+
obj_path = loc_path
|
|
1674
|
+
|
|
1675
|
+
item.image.uri = Path(obj_path)
|
|
1676
|
+
|
|
1677
|
+
# if item.image._pil is not None:
|
|
1678
|
+
# item.image._pil.close()
|
|
1679
|
+
|
|
1680
|
+
img_count += 1
|
|
1681
|
+
|
|
1682
|
+
return result
|
|
1683
|
+
|
|
1300
1684
|
def print_element_tree(self):
|
|
1301
|
-
"""
|
|
1685
|
+
"""Print_element_tree."""
|
|
1302
1686
|
for ix, (item, level) in enumerate(self.iterate_items(with_groups=True)):
|
|
1303
1687
|
if isinstance(item, GroupItem):
|
|
1304
1688
|
print(" " * level, f"{ix}: {item.label.value} with name={item.name}")
|
|
1305
1689
|
elif isinstance(item, DocItem):
|
|
1306
1690
|
print(" " * level, f"{ix}: {item.label.value}")
|
|
1307
1691
|
|
|
1308
|
-
def
|
|
1309
|
-
"""
|
|
1310
|
-
|
|
1692
|
+
def export_to_element_tree(self) -> str:
|
|
1693
|
+
"""Export_to_element_tree."""
|
|
1694
|
+
texts = []
|
|
1695
|
+
for ix, (item, level) in enumerate(self.iterate_items(with_groups=True)):
|
|
1696
|
+
if isinstance(item, GroupItem):
|
|
1697
|
+
texts.append(
|
|
1698
|
+
" " * level + f"{ix}: {item.label.value} with name={item.name}"
|
|
1699
|
+
)
|
|
1700
|
+
elif isinstance(item, DocItem):
|
|
1701
|
+
texts.append(" " * level + f"{ix}: {item.label.value}")
|
|
1702
|
+
|
|
1703
|
+
return "\n".join(texts)
|
|
1704
|
+
|
|
1705
|
+
def save_as_json(
|
|
1706
|
+
self,
|
|
1707
|
+
filename: Path,
|
|
1708
|
+
artifacts_dir: Optional[Path] = None,
|
|
1709
|
+
image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
|
|
1710
|
+
indent: int = 2,
|
|
1711
|
+
):
|
|
1712
|
+
"""Save as json."""
|
|
1713
|
+
artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
|
|
1714
|
+
|
|
1715
|
+
if image_mode == ImageRefMode.REFERENCED:
|
|
1716
|
+
os.makedirs(artifacts_dir, exist_ok=True)
|
|
1717
|
+
|
|
1718
|
+
new_doc = self._make_copy_with_refmode(
|
|
1719
|
+
artifacts_dir, image_mode, reference_path=reference_path
|
|
1720
|
+
)
|
|
1721
|
+
|
|
1722
|
+
out = new_doc.export_to_dict()
|
|
1723
|
+
with open(filename, "w") as fw:
|
|
1724
|
+
json.dump(out, fw, indent=indent)
|
|
1725
|
+
|
|
1726
|
+
def save_as_yaml(
|
|
1727
|
+
self,
|
|
1728
|
+
filename: Path,
|
|
1729
|
+
artifacts_dir: Optional[Path] = None,
|
|
1730
|
+
image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
|
|
1731
|
+
default_flow_style: bool = False,
|
|
1732
|
+
):
|
|
1733
|
+
"""Save as yaml."""
|
|
1734
|
+
artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
|
|
1735
|
+
|
|
1736
|
+
if image_mode == ImageRefMode.REFERENCED:
|
|
1737
|
+
os.makedirs(artifacts_dir, exist_ok=True)
|
|
1738
|
+
|
|
1739
|
+
new_doc = self._make_copy_with_refmode(
|
|
1740
|
+
artifacts_dir, image_mode, reference_path=reference_path
|
|
1741
|
+
)
|
|
1742
|
+
|
|
1743
|
+
out = new_doc.export_to_dict()
|
|
1744
|
+
with open(filename, "w") as fw:
|
|
1745
|
+
yaml.dump(out, fw, default_flow_style=default_flow_style)
|
|
1746
|
+
|
|
1747
|
+
def export_to_dict(
|
|
1748
|
+
self,
|
|
1749
|
+
mode: str = "json",
|
|
1750
|
+
by_alias: bool = True,
|
|
1751
|
+
exclude_none: bool = True,
|
|
1752
|
+
) -> Dict:
|
|
1753
|
+
"""Export to dict."""
|
|
1754
|
+
out = self.model_dump(mode=mode, by_alias=by_alias, exclude_none=exclude_none)
|
|
1755
|
+
|
|
1756
|
+
return out
|
|
1757
|
+
|
|
1758
|
+
def save_as_markdown(
|
|
1759
|
+
self,
|
|
1760
|
+
filename: Path,
|
|
1761
|
+
artifacts_dir: Optional[Path] = None,
|
|
1762
|
+
delim: str = "\n",
|
|
1763
|
+
from_element: int = 0,
|
|
1764
|
+
to_element: int = sys.maxsize,
|
|
1765
|
+
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
|
|
1766
|
+
strict_text: bool = False,
|
|
1767
|
+
image_placeholder: str = "<!-- image -->",
|
|
1768
|
+
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
|
|
1769
|
+
indent: int = 4,
|
|
1770
|
+
text_width: int = -1,
|
|
1771
|
+
page_no: Optional[int] = None,
|
|
1772
|
+
):
|
|
1773
|
+
"""Save to markdown."""
|
|
1774
|
+
artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
|
|
1775
|
+
|
|
1776
|
+
if image_mode == ImageRefMode.REFERENCED:
|
|
1777
|
+
os.makedirs(artifacts_dir, exist_ok=True)
|
|
1778
|
+
|
|
1779
|
+
new_doc = self._make_copy_with_refmode(
|
|
1780
|
+
artifacts_dir, image_mode, reference_path=reference_path
|
|
1781
|
+
)
|
|
1782
|
+
|
|
1783
|
+
md_out = new_doc.export_to_markdown(
|
|
1784
|
+
delim=delim,
|
|
1785
|
+
from_element=from_element,
|
|
1786
|
+
to_element=to_element,
|
|
1787
|
+
labels=labels,
|
|
1788
|
+
strict_text=strict_text,
|
|
1789
|
+
image_placeholder=image_placeholder,
|
|
1790
|
+
image_mode=image_mode,
|
|
1791
|
+
indent=indent,
|
|
1792
|
+
text_width=text_width,
|
|
1793
|
+
page_no=page_no,
|
|
1794
|
+
)
|
|
1795
|
+
|
|
1796
|
+
with open(filename, "w") as fw:
|
|
1797
|
+
fw.write(md_out)
|
|
1311
1798
|
|
|
1312
1799
|
def export_to_markdown( # noqa: C901
|
|
1313
1800
|
self,
|
|
@@ -1461,22 +1948,13 @@ class DoclingDocument(BaseModel):
|
|
|
1461
1948
|
in_list = False
|
|
1462
1949
|
mdtexts.append(item.caption_text(self))
|
|
1463
1950
|
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
)
|
|
1469
|
-
|
|
1470
|
-
|
|
1471
|
-
elif image_mode == ImageRefMode.EMBEDDED and not isinstance(
|
|
1472
|
-
item.image, ImageRef
|
|
1473
|
-
):
|
|
1474
|
-
text = (
|
|
1475
|
-
"<!-- 🖼️❌ Image not available. "
|
|
1476
|
-
"Please use `PdfPipelineOptions(generate_picture_images=True)`"
|
|
1477
|
-
" --> "
|
|
1478
|
-
)
|
|
1479
|
-
mdtexts.append(text)
|
|
1951
|
+
line = item.export_to_markdown(
|
|
1952
|
+
doc=self,
|
|
1953
|
+
image_placeholder=image_placeholder,
|
|
1954
|
+
image_mode=image_mode,
|
|
1955
|
+
)
|
|
1956
|
+
|
|
1957
|
+
mdtexts.append(line)
|
|
1480
1958
|
|
|
1481
1959
|
elif isinstance(item, DocItem) and item.label in labels:
|
|
1482
1960
|
in_list = False
|
|
@@ -1518,11 +1996,288 @@ class DoclingDocument(BaseModel):
|
|
|
1518
1996
|
image_placeholder="",
|
|
1519
1997
|
)
|
|
1520
1998
|
|
|
1521
|
-
def
|
|
1999
|
+
def save_as_html(
|
|
2000
|
+
self,
|
|
2001
|
+
filename: Path,
|
|
2002
|
+
artifacts_dir: Optional[Path] = None,
|
|
2003
|
+
from_element: int = 0,
|
|
2004
|
+
to_element: int = sys.maxsize,
|
|
2005
|
+
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
|
|
2006
|
+
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
|
|
2007
|
+
page_no: Optional[int] = None,
|
|
2008
|
+
html_lang: str = "en",
|
|
2009
|
+
html_head: str = _HTML_DEFAULT_HEAD,
|
|
2010
|
+
):
|
|
2011
|
+
"""Save to HTML."""
|
|
2012
|
+
artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
|
|
2013
|
+
|
|
2014
|
+
if image_mode == ImageRefMode.REFERENCED:
|
|
2015
|
+
os.makedirs(artifacts_dir, exist_ok=True)
|
|
2016
|
+
|
|
2017
|
+
new_doc = self._make_copy_with_refmode(
|
|
2018
|
+
artifacts_dir, image_mode, reference_path=reference_path
|
|
2019
|
+
)
|
|
2020
|
+
|
|
2021
|
+
html_out = new_doc.export_to_html(
|
|
2022
|
+
from_element=from_element,
|
|
2023
|
+
to_element=to_element,
|
|
2024
|
+
labels=labels,
|
|
2025
|
+
image_mode=image_mode,
|
|
2026
|
+
page_no=page_no,
|
|
2027
|
+
html_lang=html_lang,
|
|
2028
|
+
html_head=html_head,
|
|
2029
|
+
)
|
|
2030
|
+
|
|
2031
|
+
with open(filename, "w") as fw:
|
|
2032
|
+
fw.write(html_out)
|
|
2033
|
+
|
|
2034
|
+
def _get_output_paths(
|
|
2035
|
+
self, filename: Path, artifacts_dir: Optional[Path] = None
|
|
2036
|
+
) -> Tuple[Path, Optional[Path]]:
|
|
2037
|
+
if artifacts_dir is None:
|
|
2038
|
+
# Remove the extension and add '_pictures'
|
|
2039
|
+
artifacts_dir = filename.with_suffix("")
|
|
2040
|
+
artifacts_dir = artifacts_dir.with_name(artifacts_dir.stem + "_artifacts")
|
|
2041
|
+
if artifacts_dir.is_absolute():
|
|
2042
|
+
reference_path = None
|
|
2043
|
+
else:
|
|
2044
|
+
reference_path = filename.parent
|
|
2045
|
+
return artifacts_dir, reference_path
|
|
2046
|
+
|
|
2047
|
+
def _make_copy_with_refmode(
|
|
2048
|
+
self,
|
|
2049
|
+
artifacts_dir: Path,
|
|
2050
|
+
image_mode: ImageRefMode,
|
|
2051
|
+
reference_path: Optional[Path] = None,
|
|
2052
|
+
):
|
|
2053
|
+
new_doc = None
|
|
2054
|
+
if image_mode == ImageRefMode.PLACEHOLDER:
|
|
2055
|
+
new_doc = self
|
|
2056
|
+
elif image_mode == ImageRefMode.REFERENCED:
|
|
2057
|
+
new_doc = self._with_pictures_refs(
|
|
2058
|
+
image_dir=artifacts_dir, reference_path=reference_path
|
|
2059
|
+
)
|
|
2060
|
+
elif image_mode == ImageRefMode.EMBEDDED:
|
|
2061
|
+
new_doc = self._with_embedded_pictures()
|
|
2062
|
+
else:
|
|
2063
|
+
raise ValueError("Unsupported ImageRefMode")
|
|
2064
|
+
return new_doc
|
|
2065
|
+
|
|
2066
|
+
def export_to_html(  # noqa: C901
    self,
    from_element: int = 0,
    to_element: int = sys.maxsize,
    labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
    image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
    page_no: Optional[int] = None,
    html_lang: str = "en",
    html_head: str = _HTML_DEFAULT_HEAD,
) -> str:
    r"""Serialize to HTML.

    Walks ``self.body`` with ``iterate_items(..., with_groups=True)`` and
    emits one HTML fragment per supported item.

    Args:
        from_element: Items whose enumeration index is below this value are
            skipped.
        to_element: Items whose enumeration index is at or above this value
            are skipped.
        labels: ``DocItem`` instances whose label is not in this set are
            skipped.
        image_mode: Forwarded to ``PictureItem.export_to_html``.
        page_no: Forwarded to ``iterate_items``.
        html_lang: Value of the ``lang`` attribute on the ``<html>`` tag.
        html_head: Text inserted right after the opening ``<html>`` tag.

    Returns:
        The HTML document as a single string. Newlines inside an emitted
        fragment are replaced by ``<br>``.
    """
    import html  # local import so the method does not rely on file-level imports

    def escape_text(text: str) -> str:
        # Item text is document content, not markup: '<', '>' and '&' must be
        # escaped so they cannot break or inject tags in the output.
        return html.escape(text, quote=False)

    def close_lists(
        curr_level: int,
        prev_level: int,
        in_ordered_list: List[bool],
        html_texts: list[str],
    ):
        # Emit one closing tag per level that was left, innermost list first.
        while curr_level < prev_level and len(in_ordered_list) > 0:
            if in_ordered_list[-1]:
                html_texts.append("</ol>")
            else:
                html_texts.append("</ul>")

            prev_level -= 1
            in_ordered_list.pop()

        return (in_ordered_list, html_texts)

    head_lines = ["<!DOCTYPE html>", f'<html lang="{html_lang}">', html_head]
    html_texts: list[str] = []

    prev_level = 0  # Track the previous item's level

    # Stack of currently open lists: True for <ol>, False for <ul>
    in_ordered_list: List[bool] = []

    for ix, (item, curr_level) in enumerate(
        self.iterate_items(self.body, with_groups=True, page_no=page_no)
    ):
        # If we've moved to a lower level, we're exiting one or more groups
        if curr_level < prev_level and len(in_ordered_list) > 0:
            in_ordered_list, html_texts = close_lists(
                curr_level=curr_level,
                prev_level=prev_level,
                in_ordered_list=in_ordered_list,
                html_texts=html_texts,
            )

        prev_level = curr_level  # Update previous_level for next iteration

        if ix < from_element or to_element <= ix:
            continue  # skip as many items as you want

        if (isinstance(item, DocItem)) and (item.label not in labels):
            continue  # skip any label that is not whitelisted

        if isinstance(item, GroupItem) and item.label in [
            GroupLabel.ORDERED_LIST,
        ]:
            html_texts.append("<ol>")
            in_ordered_list.append(True)

        elif isinstance(item, GroupItem) and item.label in [
            GroupLabel.LIST,
        ]:
            html_texts.append("<ul>")
            in_ordered_list.append(False)

        elif isinstance(item, GroupItem):
            continue

        elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
            html_texts.append(f"<h1>{escape_text(item.text)}</h1>")

        elif isinstance(item, SectionHeaderItem):
            # <h6> is the deepest heading HTML defines; clamp like the
            # SECTION_HEADER branch below does.
            section_level: int = min(item.level + 1, 6)

            html_texts.append(
                f"<h{section_level}>{escape_text(item.text)}</h{section_level}>"
            )

        elif isinstance(item, TextItem) and item.label in [
            DocItemLabel.SECTION_HEADER
        ]:
            # Derive the heading level from the nesting depth, kept in 2..6
            section_level = curr_level

            if section_level <= 1:
                section_level = 2

            if section_level >= 6:
                section_level = 6

            html_texts.append(
                f"<h{section_level}>{escape_text(item.text)}</h{section_level}>"
            )

        elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
            html_texts.append(f"<pre>{escape_text(item.text)}</pre>")

        elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
            # captions are printed in picture and table ... skipping for now
            continue

        elif isinstance(item, ListItem):
            html_texts.append(f"<li>{escape_text(item.text)}</li>")

        elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
            html_texts.append(f"<li>{escape_text(item.text)}</li>")

        elif isinstance(item, TextItem) and item.label in labels:
            html_texts.append(f"<p>{escape_text(item.text)}</p>")

        elif isinstance(item, TableItem):
            html_texts.append(item.export_to_html(doc=self, add_caption=True))

        elif isinstance(item, PictureItem):
            html_texts.append(
                item.export_to_html(
                    doc=self, add_caption=True, image_mode=image_mode
                )
            )

        # Any other item type has no HTML rendering here and is dropped.

    # Lists that are still open when the document ends must be closed too,
    # otherwise the output has dangling <ol>/<ul> elements.
    while in_ordered_list:
        html_texts.append("</ol>" if in_ordered_list.pop() else "</ul>")

    html_texts.append("</html>")

    lines = list(head_lines)
    lines.extend(line.replace("\n", "<br>") for line in html_texts)

    return "\n".join(lines).strip()
|
|
2232
|
+
|
|
2233
|
+
def save_as_document_tokens(
    self,
    filename: Path,
    delim: str = "\n\n",
    from_element: int = 0,
    to_element: int = sys.maxsize,
    labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
    xsize: int = 100,
    ysize: int = 100,
    add_location: bool = True,
    add_content: bool = True,
    add_page_index: bool = True,
    # table specific flags
    add_table_cell_location: bool = False,
    add_table_cell_label: bool = True,
    add_table_cell_text: bool = True,
    # specifics
    page_no: Optional[int] = None,
    with_groups: bool = True,
):
    r"""Save the document content to a DocumentToken format.

    Every argument except ``filename`` is forwarded unchanged to
    ``export_to_document_tokens``; the returned string is then written to
    ``filename``.

    Args:
        filename: Destination file. It is opened in write mode, so an
            existing file is overwritten.
    """
    out = self.export_to_document_tokens(
        delim=delim,
        from_element=from_element,
        to_element=to_element,
        labels=labels,
        xsize=xsize,
        ysize=ysize,
        add_location=add_location,
        add_content=add_content,
        add_page_index=add_page_index,
        # table specific flags
        add_table_cell_location=add_table_cell_location,
        add_table_cell_label=add_table_cell_label,
        add_table_cell_text=add_table_cell_text,
        # specifics
        page_no=page_no,
        with_groups=with_groups,
    )

    # Pin the encoding: the platform default varies with the locale and can
    # fail on document text outside that locale's character set.
    with open(filename, "w", encoding="utf-8") as fw:
        fw.write(out)
|
|
2275
|
+
|
|
2276
|
+
def export_to_document_tokens(
|
|
2277
|
+
self,
|
|
2278
|
+
delim: str = "\n",
|
|
2279
|
+
from_element: int = 0,
|
|
2280
|
+
to_element: int = sys.maxsize,
|
|
1526
2281
|
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
|
|
1527
2282
|
xsize: int = 100,
|
|
1528
2283
|
ysize: int = 100,
|
|
@@ -1533,8 +2288,12 @@ class DoclingDocument(BaseModel):
|
|
|
1533
2288
|
add_table_cell_location: bool = False,
|
|
1534
2289
|
add_table_cell_label: bool = True,
|
|
1535
2290
|
add_table_cell_text: bool = True,
|
|
2291
|
+
# specifics
|
|
2292
|
+
page_no: Optional[int] = None,
|
|
2293
|
+
with_groups: bool = True,
|
|
2294
|
+
newline: bool = True,
|
|
1536
2295
|
) -> str:
|
|
1537
|
-
r"""Exports the document content to
|
|
2296
|
+
r"""Exports the document content to a DocumentToken format.
|
|
1538
2297
|
|
|
1539
2298
|
Operates on a slice of the document's body as defined through arguments
|
|
1540
2299
|
from_element and to_element; defaulting to the whole main_text.
|
|
@@ -1554,44 +2313,102 @@ class DoclingDocument(BaseModel):
|
|
|
1554
2313
|
:returns: The content of the document formatted as a DocTags string.
|
|
1555
2314
|
:rtype: str
|
|
1556
2315
|
"""
|
|
1557
|
-
new_line = ""
|
|
1558
|
-
if delim:
|
|
1559
|
-
new_line = "\n"
|
|
1560
2316
|
|
|
1561
|
-
|
|
2317
|
+
def close_lists(
|
|
2318
|
+
curr_level: int,
|
|
2319
|
+
prev_level: int,
|
|
2320
|
+
in_ordered_list: List[bool],
|
|
2321
|
+
result: str,
|
|
2322
|
+
delim: str,
|
|
2323
|
+
):
|
|
2324
|
+
|
|
2325
|
+
if len(in_ordered_list) == 0:
|
|
2326
|
+
return (in_ordered_list, result)
|
|
2327
|
+
|
|
2328
|
+
while curr_level < prev_level and len(in_ordered_list) > 0:
|
|
2329
|
+
if in_ordered_list[-1]:
|
|
2330
|
+
result += f"</ordered_list>{delim}"
|
|
2331
|
+
else:
|
|
2332
|
+
result += f"</unordered_list>{delim}"
|
|
1562
2333
|
|
|
1563
|
-
|
|
2334
|
+
prev_level -= 1
|
|
2335
|
+
in_ordered_list.pop() # = in_ordered_list[:-1]
|
|
2336
|
+
|
|
2337
|
+
return (in_ordered_list, result)
|
|
2338
|
+
|
|
2339
|
+
if newline:
|
|
2340
|
+
delim = "\n"
|
|
2341
|
+
else:
|
|
2342
|
+
delim = ""
|
|
1564
2343
|
|
|
1565
|
-
|
|
1566
|
-
|
|
1567
|
-
|
|
1568
|
-
|
|
2344
|
+
prev_level = 0 # Track the previous item's level
|
|
2345
|
+
|
|
2346
|
+
in_ordered_list: List[bool] = [] # False
|
|
2347
|
+
|
|
2348
|
+
result = f"{DocumentToken.BEG_DOCUMENT.value}{delim}"
|
|
2349
|
+
|
|
2350
|
+
for ix, (item, curr_level) in enumerate(
|
|
2351
|
+
self.iterate_items(self.body, with_groups=True)
|
|
2352
|
+
):
|
|
2353
|
+
|
|
2354
|
+
# If we've moved to a lower level, we're exiting one or more groups
|
|
2355
|
+
if curr_level < prev_level and len(in_ordered_list) > 0:
|
|
2356
|
+
# Calculate how many levels we've exited
|
|
2357
|
+
# level_difference = previous_level - level
|
|
2358
|
+
# Decrement list_nesting_level for each list group we've exited
|
|
2359
|
+
# list_nesting_level = max(0, list_nesting_level - level_difference)
|
|
2360
|
+
|
|
2361
|
+
in_ordered_list, result = close_lists(
|
|
2362
|
+
curr_level=curr_level,
|
|
2363
|
+
prev_level=prev_level,
|
|
2364
|
+
in_ordered_list=in_ordered_list,
|
|
2365
|
+
result=result,
|
|
2366
|
+
delim=delim,
|
|
2367
|
+
)
|
|
2368
|
+
|
|
2369
|
+
prev_level = curr_level # Update previous_level for next iteration
|
|
2370
|
+
|
|
2371
|
+
if ix < from_element or to_element <= ix:
|
|
1569
2372
|
continue # skip as many items as you want
|
|
1570
2373
|
|
|
1571
|
-
if
|
|
1572
|
-
|
|
2374
|
+
if (isinstance(item, DocItem)) and (item.label not in labels):
|
|
2375
|
+
continue # skip any label that is not whitelisted
|
|
1573
2376
|
|
|
1574
|
-
if
|
|
1575
|
-
|
|
2377
|
+
if isinstance(item, GroupItem) and item.label in [
|
|
2378
|
+
GroupLabel.ORDERED_LIST,
|
|
2379
|
+
]:
|
|
1576
2380
|
|
|
1577
|
-
|
|
2381
|
+
result += f"<ordered_list>{delim}"
|
|
2382
|
+
in_ordered_list.append(True)
|
|
1578
2383
|
|
|
1579
|
-
|
|
2384
|
+
elif isinstance(item, GroupItem) and item.label in [
|
|
2385
|
+
GroupLabel.LIST,
|
|
2386
|
+
]:
|
|
1580
2387
|
|
|
1581
|
-
|
|
2388
|
+
result += f"<unordered_list>{delim}"
|
|
2389
|
+
in_ordered_list.append(False)
|
|
1582
2390
|
|
|
1583
|
-
|
|
1584
|
-
|
|
2391
|
+
elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
|
|
2392
|
+
# captions are printed in picture and table ... skipping for now
|
|
2393
|
+
continue
|
|
1585
2394
|
|
|
1586
|
-
|
|
1587
|
-
|
|
2395
|
+
elif isinstance(item, SectionHeaderItem):
|
|
2396
|
+
|
|
2397
|
+
result += item.export_to_document_tokens(
|
|
2398
|
+
doc=self,
|
|
2399
|
+
new_line=delim,
|
|
2400
|
+
xsize=xsize,
|
|
2401
|
+
ysize=ysize,
|
|
2402
|
+
add_location=add_location,
|
|
2403
|
+
add_content=add_content,
|
|
2404
|
+
add_page_index=add_page_index,
|
|
2405
|
+
)
|
|
1588
2406
|
|
|
1589
|
-
|
|
1590
|
-
if isinstance(item, TextItem) and (item_type in labels):
|
|
2407
|
+
elif isinstance(item, TextItem) and (item.label in labels):
|
|
1591
2408
|
|
|
1592
|
-
|
|
2409
|
+
result += item.export_to_document_tokens(
|
|
1593
2410
|
doc=self,
|
|
1594
|
-
new_line=
|
|
2411
|
+
new_line=delim,
|
|
1595
2412
|
xsize=xsize,
|
|
1596
2413
|
ysize=ysize,
|
|
1597
2414
|
add_location=add_location,
|
|
@@ -1599,11 +2416,11 @@ class DoclingDocument(BaseModel):
|
|
|
1599
2416
|
add_page_index=add_page_index,
|
|
1600
2417
|
)
|
|
1601
2418
|
|
|
1602
|
-
elif isinstance(item, TableItem) and (
|
|
2419
|
+
elif isinstance(item, TableItem) and (item.label in labels):
|
|
1603
2420
|
|
|
1604
|
-
|
|
2421
|
+
result += item.export_to_document_tokens(
|
|
1605
2422
|
doc=self,
|
|
1606
|
-
new_line=
|
|
2423
|
+
new_line=delim,
|
|
1607
2424
|
xsize=xsize,
|
|
1608
2425
|
ysize=ysize,
|
|
1609
2426
|
add_caption=True,
|
|
@@ -1615,11 +2432,11 @@ class DoclingDocument(BaseModel):
|
|
|
1615
2432
|
add_page_index=add_page_index,
|
|
1616
2433
|
)
|
|
1617
2434
|
|
|
1618
|
-
elif isinstance(item, PictureItem) and (
|
|
2435
|
+
elif isinstance(item, PictureItem) and (item.label in labels):
|
|
1619
2436
|
|
|
1620
|
-
|
|
2437
|
+
result += item.export_to_document_tokens(
|
|
1621
2438
|
doc=self,
|
|
1622
|
-
new_line=
|
|
2439
|
+
new_line=delim,
|
|
1623
2440
|
xsize=xsize,
|
|
1624
2441
|
ysize=ysize,
|
|
1625
2442
|
add_caption=True,
|
|
@@ -1628,9 +2445,9 @@ class DoclingDocument(BaseModel):
|
|
|
1628
2445
|
add_page_index=add_page_index,
|
|
1629
2446
|
)
|
|
1630
2447
|
|
|
1631
|
-
|
|
2448
|
+
result += DocumentToken.END_DOCUMENT.value
|
|
1632
2449
|
|
|
1633
|
-
return
|
|
2450
|
+
return result
|
|
1634
2451
|
|
|
1635
2452
|
def _export_to_indented_text(
|
|
1636
2453
|
self, indent=" ", max_text_len: int = -1, explicit_tables: bool = False
|
docling_core/utils/file.py
CHANGED
|
@@ -65,3 +65,43 @@ def resolve_file_source(
|
|
|
65
65
|
except ValidationError:
|
|
66
66
|
raise ValueError(f"Unexpected source type encountered: {type(source)}")
|
|
67
67
|
return local_path
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def relative_path(src: Path, target: Path) -> Path:
    """Compute the relative path that leads from `src` to `target`.

    Both inputs are passed through `Path.resolve()` before they are compared,
    and the result is built from ".." segments up to the deepest shared
    ancestor followed by the remaining parts of `target`.

    Args:
        src (str | Path): The source directory or file path.
        target (str | Path): The target directory or file path.

    Returns:
        Path: The relative path from `src` to `target`.

    Raises:
        ValueError: If `src` or `target` is still not absolute after being
            resolved.
    """
    src = Path(src).resolve()
    target = Path(target).resolve()

    # Guard against paths that resolve() could not make absolute
    if not src.is_absolute():
        raise ValueError(f"The source path must be absolute: {src}")
    if not target.is_absolute():
        raise ValueError(f"The target path must be absolute: {target}")

    # Count how many leading parts the two paths share
    shared = 0
    for src_part, target_part in zip(src.parts, target.parts):
        if src_part != target_part:
            break
        shared += 1

    # Climb from src to the shared ancestor, then descend to target
    climb = [".."] * (len(src.parts) - shared)
    descend = target.parts[shared:]

    return Path(*climb, *descend)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.5.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Home-page: https://ds4sd.github.io/
|
|
6
6
|
License: MIT
|
|
@@ -29,7 +29,8 @@ Requires-Dist: jsonref (>=1.1.0,<2.0.0)
|
|
|
29
29
|
Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
|
|
30
30
|
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
|
31
31
|
Requires-Dist: pillow (>=10.3.0,<11.0.0)
|
|
32
|
-
Requires-Dist: pydantic (>=2.6.0,<
|
|
32
|
+
Requires-Dist: pydantic (>=2.6.0,<2.10)
|
|
33
|
+
Requires-Dist: pyyaml (>=5.1,<7.0.0)
|
|
33
34
|
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
|
34
35
|
Project-URL: Repository, https://github.com/DS4SD/docling-core
|
|
35
36
|
Description-Content-Type: text/markdown
|
|
@@ -20,8 +20,8 @@ docling_core/transforms/chunker/hierarchical_chunker.py,sha256=V4FiOYqL0GgBqVB7x
|
|
|
20
20
|
docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
|
|
21
21
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
22
22
|
docling_core/types/doc/__init__.py,sha256=bEL4zKVOG7Wxm6xQrgF58mu-Teds9aSavuEAKVNhrTU,639
|
|
23
|
-
docling_core/types/doc/base.py,sha256=
|
|
24
|
-
docling_core/types/doc/document.py,sha256=
|
|
23
|
+
docling_core/types/doc/base.py,sha256=_ttU8QI8wXDTQRUnN5n7L6D9wYFVLSAibxlFoMbgAsk,4557
|
|
24
|
+
docling_core/types/doc/document.py,sha256=05q8KZ64TVpxJoegPy7MOlvI0fmqUtKRKZMGsdvUz9c,85711
|
|
25
25
|
docling_core/types/doc/labels.py,sha256=A8vWP82VAeXO1rlCO0oDKo_Hb8uDeQe0myOTY3P03hk,1596
|
|
26
26
|
docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
|
|
27
27
|
docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
|
|
@@ -44,13 +44,13 @@ docling_core/types/rec/statement.py,sha256=YwcV4CbVaAbzNwh14yJ_6Py3Ww0XnUJrEEUiK
|
|
|
44
44
|
docling_core/types/rec/subject.py,sha256=PRCERGTMs4YhR3_Ne6jogkm41zYg8uUWb1yFpM7atm4,2572
|
|
45
45
|
docling_core/utils/__init__.py,sha256=VauNNpWRHG0_ISKrsy5-gTxicrdQZSau6qMfuMl3iqk,120
|
|
46
46
|
docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,874
|
|
47
|
-
docling_core/utils/file.py,sha256=
|
|
47
|
+
docling_core/utils/file.py,sha256=ug4-z0KuthkEb_d5YDRPbY79PWfNSj9GYsi16xF2sDA,3699
|
|
48
48
|
docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
|
|
49
49
|
docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
|
|
50
50
|
docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
|
|
51
51
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
52
|
-
docling_core-2.
|
|
53
|
-
docling_core-2.
|
|
54
|
-
docling_core-2.
|
|
55
|
-
docling_core-2.
|
|
56
|
-
docling_core-2.
|
|
52
|
+
docling_core-2.5.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
53
|
+
docling_core-2.5.0.dist-info/METADATA,sha256=u4KdNbLkumFHT5HFI7XZo9AUeYryHHkH8iYpDDInA7Q,5468
|
|
54
|
+
docling_core-2.5.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
55
|
+
docling_core-2.5.0.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
|
|
56
|
+
docling_core-2.5.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|