docling-core 2.23.3__py3-none-any.whl → 2.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/experimental/serializer/base.py +2 -2
- docling_core/experimental/serializer/common.py +250 -196
- docling_core/experimental/serializer/doctags.py +492 -0
- docling_core/experimental/serializer/markdown.py +70 -41
- docling_core/types/doc/document.py +412 -418
- docling_core/types/doc/page.py +18 -6
- docling_core/types/doc/tokens.py +192 -26
- {docling_core-2.23.3.dist-info → docling_core-2.24.0.dist-info}/METADATA +1 -1
- {docling_core-2.23.3.dist-info → docling_core-2.24.0.dist-info}/RECORD +12 -11
- {docling_core-2.23.3.dist-info → docling_core-2.24.0.dist-info}/LICENSE +0 -0
- {docling_core-2.23.3.dist-info → docling_core-2.24.0.dist-info}/WHEEL +0 -0
- {docling_core-2.23.3.dist-info → docling_core-2.24.0.dist-info}/entry_points.txt +0 -0
|
@@ -50,7 +50,7 @@ from docling_core.types.doc.labels import (
|
|
|
50
50
|
GraphLinkLabel,
|
|
51
51
|
GroupLabel,
|
|
52
52
|
)
|
|
53
|
-
from docling_core.types.doc.tokens import DocumentToken, TableToken
|
|
53
|
+
from docling_core.types.doc.tokens import _LOC_PREFIX, DocumentToken, TableToken
|
|
54
54
|
from docling_core.types.doc.utils import (
|
|
55
55
|
get_html_tag_with_text_direction,
|
|
56
56
|
get_text_direction,
|
|
@@ -79,6 +79,7 @@ DEFAULT_EXPORT_LABELS = {
|
|
|
79
79
|
DocItemLabel.REFERENCE,
|
|
80
80
|
DocItemLabel.PAGE_HEADER,
|
|
81
81
|
DocItemLabel.PAGE_FOOTER,
|
|
82
|
+
DocItemLabel.KEY_VALUE_REGION,
|
|
82
83
|
}
|
|
83
84
|
|
|
84
85
|
DOCUMENT_TOKENS_EXPORT_LABELS = DEFAULT_EXPORT_LABELS.copy()
|
|
@@ -414,6 +415,7 @@ class DocumentOrigin(BaseModel):
|
|
|
414
415
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
415
416
|
"text/asciidoc",
|
|
416
417
|
"text/markdown",
|
|
418
|
+
"text/csv",
|
|
417
419
|
]
|
|
418
420
|
|
|
419
421
|
@field_validator("binary_hash", mode="before")
|
|
@@ -643,7 +645,7 @@ class DocItem(
|
|
|
643
645
|
def get_location_tokens(
|
|
644
646
|
self,
|
|
645
647
|
doc: "DoclingDocument",
|
|
646
|
-
new_line: str,
|
|
648
|
+
new_line: str = "", # deprecated
|
|
647
649
|
xsize: int = 500,
|
|
648
650
|
ysize: int = 500,
|
|
649
651
|
) -> str:
|
|
@@ -662,7 +664,7 @@ class DocItem(
|
|
|
662
664
|
xsize=xsize,
|
|
663
665
|
ysize=ysize,
|
|
664
666
|
)
|
|
665
|
-
location +=
|
|
667
|
+
location += loc_str
|
|
666
668
|
|
|
667
669
|
return location
|
|
668
670
|
|
|
@@ -722,10 +724,15 @@ class TextItem(DocItem):
|
|
|
722
724
|
formatting: Optional[Formatting] = None
|
|
723
725
|
hyperlink: Optional[Union[AnyUrl, Path]] = None
|
|
724
726
|
|
|
725
|
-
|
|
727
|
+
@deprecated("Use export_to_doctags() instead.")
|
|
728
|
+
def export_to_document_tokens(self, *args, **kwargs):
|
|
729
|
+
r"""Export to DocTags format."""
|
|
730
|
+
return self.export_to_doctags(*args, **kwargs)
|
|
731
|
+
|
|
732
|
+
def export_to_doctags(
|
|
726
733
|
self,
|
|
727
734
|
doc: "DoclingDocument",
|
|
728
|
-
new_line: str = "",
|
|
735
|
+
new_line: str = "", # deprecated
|
|
729
736
|
xsize: int = 500,
|
|
730
737
|
ysize: int = 500,
|
|
731
738
|
add_location: bool = True,
|
|
@@ -734,29 +741,29 @@ class TextItem(DocItem):
|
|
|
734
741
|
r"""Export text element to document tokens format.
|
|
735
742
|
|
|
736
743
|
:param doc: "DoclingDocument":
|
|
737
|
-
:param new_line: str (Default value = "")
|
|
744
|
+
:param new_line: str (Default value = "") Deprecated
|
|
738
745
|
:param xsize: int: (Default value = 500)
|
|
739
746
|
:param ysize: int: (Default value = 500)
|
|
740
747
|
:param add_location: bool: (Default value = True)
|
|
741
748
|
:param add_content: bool: (Default value = True)
|
|
742
749
|
|
|
743
750
|
"""
|
|
744
|
-
|
|
751
|
+
from docling_core.experimental.serializer.doctags import (
|
|
752
|
+
DocTagsDocSerializer,
|
|
753
|
+
DocTagsParams,
|
|
754
|
+
)
|
|
745
755
|
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
new_line=new_line,
|
|
756
|
+
serializer = DocTagsDocSerializer(
|
|
757
|
+
doc=doc,
|
|
758
|
+
params=DocTagsParams(
|
|
750
759
|
xsize=xsize,
|
|
751
760
|
ysize=ysize,
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
return body
|
|
761
|
+
add_location=add_location,
|
|
762
|
+
add_content=add_content,
|
|
763
|
+
),
|
|
764
|
+
)
|
|
765
|
+
text = serializer.serialize(item=self).text
|
|
766
|
+
return text
|
|
760
767
|
|
|
761
768
|
|
|
762
769
|
class TitleItem(TextItem):
|
|
@@ -775,10 +782,15 @@ class SectionHeaderItem(TextItem):
|
|
|
775
782
|
)
|
|
776
783
|
level: LevelNumber = 1
|
|
777
784
|
|
|
778
|
-
|
|
785
|
+
@deprecated("Use export_to_doctags() instead.")
|
|
786
|
+
def export_to_document_tokens(self, *args, **kwargs):
|
|
787
|
+
r"""Export to DocTags format."""
|
|
788
|
+
return self.export_to_doctags(*args, **kwargs)
|
|
789
|
+
|
|
790
|
+
def export_to_doctags(
|
|
779
791
|
self,
|
|
780
792
|
doc: "DoclingDocument",
|
|
781
|
-
new_line: str = "",
|
|
793
|
+
new_line: str = "", # deprecated
|
|
782
794
|
xsize: int = 500,
|
|
783
795
|
ysize: int = 500,
|
|
784
796
|
add_location: bool = True,
|
|
@@ -787,34 +799,29 @@ class SectionHeaderItem(TextItem):
|
|
|
787
799
|
r"""Export text element to document tokens format.
|
|
788
800
|
|
|
789
801
|
:param doc: "DoclingDocument":
|
|
790
|
-
:param new_line: str (Default value = "")
|
|
802
|
+
:param new_line: str (Default value = "") Deprecated
|
|
791
803
|
:param xsize: int: (Default value = 500)
|
|
792
804
|
:param ysize: int: (Default value = 500)
|
|
793
805
|
:param add_location: bool: (Default value = True)
|
|
794
806
|
:param add_content: bool: (Default value = True)
|
|
795
807
|
|
|
796
808
|
"""
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
# body
|
|
802
|
-
# ), f"failed DocumentToken.is_known_token({body})"
|
|
809
|
+
from docling_core.experimental.serializer.doctags import (
|
|
810
|
+
DocTagsDocSerializer,
|
|
811
|
+
DocTagsParams,
|
|
812
|
+
)
|
|
803
813
|
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
new_line=new_line,
|
|
814
|
+
serializer = DocTagsDocSerializer(
|
|
815
|
+
doc=doc,
|
|
816
|
+
params=DocTagsParams(
|
|
808
817
|
xsize=xsize,
|
|
809
818
|
ysize=ysize,
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
return body
|
|
819
|
+
add_location=add_location,
|
|
820
|
+
add_content=add_content,
|
|
821
|
+
),
|
|
822
|
+
)
|
|
823
|
+
text = serializer.serialize(item=self).text
|
|
824
|
+
return text
|
|
818
825
|
|
|
819
826
|
|
|
820
827
|
class ListItem(TextItem):
|
|
@@ -865,10 +872,15 @@ class CodeItem(FloatingItem, TextItem):
|
|
|
865
872
|
)
|
|
866
873
|
code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN
|
|
867
874
|
|
|
868
|
-
|
|
875
|
+
@deprecated("Use export_to_doctags() instead.")
|
|
876
|
+
def export_to_document_tokens(self, *args, **kwargs):
|
|
877
|
+
r"""Export to DocTags format."""
|
|
878
|
+
return self.export_to_doctags(*args, **kwargs)
|
|
879
|
+
|
|
880
|
+
def export_to_doctags(
|
|
869
881
|
self,
|
|
870
882
|
doc: "DoclingDocument",
|
|
871
|
-
new_line: str = "",
|
|
883
|
+
new_line: str = "", # deprecated
|
|
872
884
|
xsize: int = 500,
|
|
873
885
|
ysize: int = 500,
|
|
874
886
|
add_location: bool = True,
|
|
@@ -877,29 +889,29 @@ class CodeItem(FloatingItem, TextItem):
|
|
|
877
889
|
r"""Export text element to document tokens format.
|
|
878
890
|
|
|
879
891
|
:param doc: "DoclingDocument":
|
|
880
|
-
:param new_line: str (Default value = "")
|
|
892
|
+
:param new_line: str (Default value = "") Deprecated
|
|
881
893
|
:param xsize: int: (Default value = 500)
|
|
882
894
|
:param ysize: int: (Default value = 500)
|
|
883
895
|
:param add_location: bool: (Default value = True)
|
|
884
896
|
:param add_content: bool: (Default value = True)
|
|
885
897
|
|
|
886
898
|
"""
|
|
887
|
-
|
|
899
|
+
from docling_core.experimental.serializer.doctags import (
|
|
900
|
+
DocTagsDocSerializer,
|
|
901
|
+
DocTagsParams,
|
|
902
|
+
)
|
|
888
903
|
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
new_line=new_line,
|
|
904
|
+
serializer = DocTagsDocSerializer(
|
|
905
|
+
doc=doc,
|
|
906
|
+
params=DocTagsParams(
|
|
893
907
|
xsize=xsize,
|
|
894
908
|
ysize=ysize,
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
return body
|
|
909
|
+
add_location=add_location,
|
|
910
|
+
add_content=add_content,
|
|
911
|
+
),
|
|
912
|
+
)
|
|
913
|
+
text = serializer.serialize(item=self).text
|
|
914
|
+
return text
|
|
903
915
|
|
|
904
916
|
|
|
905
917
|
class FormulaItem(TextItem):
|
|
@@ -953,7 +965,10 @@ class PictureItem(FloatingItem):
|
|
|
953
965
|
image_placeholder: str = "<!-- image -->",
|
|
954
966
|
) -> str:
|
|
955
967
|
"""Export picture to Markdown format."""
|
|
956
|
-
from docling_core.experimental.serializer.markdown import
|
|
968
|
+
from docling_core.experimental.serializer.markdown import (
|
|
969
|
+
MarkdownDocSerializer,
|
|
970
|
+
MarkdownParams,
|
|
971
|
+
)
|
|
957
972
|
|
|
958
973
|
if not add_caption:
|
|
959
974
|
_logger.warning(
|
|
@@ -961,20 +976,13 @@ class PictureItem(FloatingItem):
|
|
|
961
976
|
)
|
|
962
977
|
|
|
963
978
|
serializer = MarkdownDocSerializer(
|
|
964
|
-
doc=
|
|
965
|
-
|
|
966
|
-
)
|
|
967
|
-
text = (
|
|
968
|
-
serializer.picture_serializer.serialize(
|
|
969
|
-
item=self,
|
|
970
|
-
doc_serializer=serializer,
|
|
971
|
-
doc=doc,
|
|
979
|
+
doc=doc,
|
|
980
|
+
params=MarkdownParams(
|
|
972
981
|
image_mode=image_mode,
|
|
973
982
|
image_placeholder=image_placeholder,
|
|
974
|
-
)
|
|
975
|
-
if serializer.picture_serializer
|
|
976
|
-
else ""
|
|
983
|
+
),
|
|
977
984
|
)
|
|
985
|
+
text = serializer.serialize(item=self).text
|
|
978
986
|
return text
|
|
979
987
|
|
|
980
988
|
def export_to_html(
|
|
@@ -1033,10 +1041,15 @@ class PictureItem(FloatingItem):
|
|
|
1033
1041
|
else:
|
|
1034
1042
|
return default_response
|
|
1035
1043
|
|
|
1036
|
-
|
|
1044
|
+
@deprecated("Use export_to_doctags() instead.")
|
|
1045
|
+
def export_to_document_tokens(self, *args, **kwargs):
|
|
1046
|
+
r"""Export to DocTags format."""
|
|
1047
|
+
return self.export_to_doctags(*args, **kwargs)
|
|
1048
|
+
|
|
1049
|
+
def export_to_doctags(
|
|
1037
1050
|
self,
|
|
1038
1051
|
doc: "DoclingDocument",
|
|
1039
|
-
new_line: str = "",
|
|
1052
|
+
new_line: str = "", # deprecated
|
|
1040
1053
|
xsize: int = 500,
|
|
1041
1054
|
ysize: int = 500,
|
|
1042
1055
|
add_location: bool = True,
|
|
@@ -1046,7 +1059,7 @@ class PictureItem(FloatingItem):
|
|
|
1046
1059
|
r"""Export picture to document tokens format.
|
|
1047
1060
|
|
|
1048
1061
|
:param doc: "DoclingDocument":
|
|
1049
|
-
:param new_line: str (Default value = "")
|
|
1062
|
+
:param new_line: str (Default value = "") Deprecated
|
|
1050
1063
|
:param xsize: int: (Default value = 500)
|
|
1051
1064
|
:param ysize: int: (Default value = 500)
|
|
1052
1065
|
:param add_location: bool: (Default value = True)
|
|
@@ -1055,59 +1068,23 @@ class PictureItem(FloatingItem):
|
|
|
1055
1068
|
:param # not used at the moment
|
|
1056
1069
|
|
|
1057
1070
|
"""
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
|
|
1071
|
+
from docling_core.experimental.serializer.doctags import (
|
|
1072
|
+
DocTagsDocSerializer,
|
|
1073
|
+
DocTagsParams,
|
|
1074
|
+
)
|
|
1075
|
+
|
|
1076
|
+
serializer = DocTagsDocSerializer(
|
|
1077
|
+
doc=doc,
|
|
1078
|
+
params=DocTagsParams(
|
|
1063
1079
|
xsize=xsize,
|
|
1064
1080
|
ysize=ysize,
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
if len(classifications) > 0:
|
|
1073
|
-
# ! TODO: currently this code assumes class_name is of type 'str'
|
|
1074
|
-
# ! TODO: when it will change to an ENUM --> adapt code
|
|
1075
|
-
predicted_class = classifications[0].predicted_classes[0].class_name
|
|
1076
|
-
body += DocumentToken.get_picture_classification_token(predicted_class)
|
|
1077
|
-
|
|
1078
|
-
smiles_annotations = [
|
|
1079
|
-
ann for ann in self.annotations if isinstance(ann, PictureMoleculeData)
|
|
1080
|
-
]
|
|
1081
|
-
if len(smiles_annotations) > 0:
|
|
1082
|
-
body += (
|
|
1083
|
-
"<"
|
|
1084
|
-
+ DocumentToken.SMILES.value
|
|
1085
|
-
+ ">"
|
|
1086
|
-
+ smiles_annotations[0].smi
|
|
1087
|
-
+ "</"
|
|
1088
|
-
+ DocumentToken.SMILES.value
|
|
1089
|
-
+ ">"
|
|
1090
|
-
)
|
|
1091
|
-
|
|
1092
|
-
if add_caption and len(self.captions):
|
|
1093
|
-
text = self.caption_text(doc)
|
|
1094
|
-
|
|
1095
|
-
if len(text):
|
|
1096
|
-
body += f"<{DocItemLabel.CAPTION.value}>"
|
|
1097
|
-
for caption in self.captions:
|
|
1098
|
-
body += caption.resolve(doc).get_location_tokens(
|
|
1099
|
-
doc=doc,
|
|
1100
|
-
new_line=new_line,
|
|
1101
|
-
xsize=xsize,
|
|
1102
|
-
ysize=ysize,
|
|
1103
|
-
)
|
|
1104
|
-
body += f"{text.strip()}"
|
|
1105
|
-
body += f"</{DocItemLabel.CAPTION.value}>"
|
|
1106
|
-
body += f"{new_line}"
|
|
1107
|
-
|
|
1108
|
-
body += f"</{self.label.value}>\n"
|
|
1109
|
-
|
|
1110
|
-
return body
|
|
1081
|
+
add_location=add_location,
|
|
1082
|
+
add_content=add_content,
|
|
1083
|
+
add_caption=add_caption,
|
|
1084
|
+
),
|
|
1085
|
+
)
|
|
1086
|
+
text = serializer.serialize(item=self).text
|
|
1087
|
+
return text
|
|
1111
1088
|
|
|
1112
1089
|
|
|
1113
1090
|
class TableItem(FloatingItem):
|
|
@@ -1171,18 +1148,8 @@ class TableItem(FloatingItem):
|
|
|
1171
1148
|
MarkdownDocSerializer,
|
|
1172
1149
|
)
|
|
1173
1150
|
|
|
1174
|
-
serializer = MarkdownDocSerializer(
|
|
1175
|
-
|
|
1176
|
-
)
|
|
1177
|
-
text = (
|
|
1178
|
-
serializer.table_serializer.serialize(
|
|
1179
|
-
item=self,
|
|
1180
|
-
doc_serializer=serializer,
|
|
1181
|
-
doc=doc,
|
|
1182
|
-
).text
|
|
1183
|
-
if serializer.table_serializer
|
|
1184
|
-
else ""
|
|
1185
|
-
)
|
|
1151
|
+
serializer = MarkdownDocSerializer(doc=doc)
|
|
1152
|
+
text = serializer.serialize(item=self).text
|
|
1186
1153
|
return text
|
|
1187
1154
|
else:
|
|
1188
1155
|
_logger.warning(
|
|
@@ -1391,10 +1358,15 @@ class TableItem(FloatingItem):
|
|
|
1391
1358
|
body_str = "".join(body)
|
|
1392
1359
|
return body_str
|
|
1393
1360
|
|
|
1394
|
-
|
|
1361
|
+
@deprecated("Use export_to_doctags() instead.")
|
|
1362
|
+
def export_to_document_tokens(self, *args, **kwargs):
|
|
1363
|
+
r"""Export to DocTags format."""
|
|
1364
|
+
return self.export_to_doctags(*args, **kwargs)
|
|
1365
|
+
|
|
1366
|
+
def export_to_doctags(
|
|
1395
1367
|
self,
|
|
1396
1368
|
doc: "DoclingDocument",
|
|
1397
|
-
new_line: str = "",
|
|
1369
|
+
new_line: str = "", # deprecated
|
|
1398
1370
|
xsize: int = 500,
|
|
1399
1371
|
ysize: int = 500,
|
|
1400
1372
|
add_location: bool = True,
|
|
@@ -1405,7 +1377,7 @@ class TableItem(FloatingItem):
|
|
|
1405
1377
|
r"""Export table to document tokens format.
|
|
1406
1378
|
|
|
1407
1379
|
:param doc: "DoclingDocument":
|
|
1408
|
-
:param new_line: str (Default value = "")
|
|
1380
|
+
:param new_line: str (Default value = "") Deprecated
|
|
1409
1381
|
:param xsize: int: (Default value = 500)
|
|
1410
1382
|
:param ysize: int: (Default value = 500)
|
|
1411
1383
|
:param add_location: bool: (Default value = True)
|
|
@@ -1414,39 +1386,24 @@ class TableItem(FloatingItem):
|
|
|
1414
1386
|
:param add_caption: bool: (Default value = True)
|
|
1415
1387
|
|
|
1416
1388
|
"""
|
|
1417
|
-
|
|
1418
|
-
|
|
1419
|
-
|
|
1389
|
+
from docling_core.experimental.serializer.doctags import (
|
|
1390
|
+
DocTagsDocSerializer,
|
|
1391
|
+
DocTagsParams,
|
|
1392
|
+
)
|
|
1420
1393
|
|
|
1421
|
-
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
new_line=new_line,
|
|
1394
|
+
serializer = DocTagsDocSerializer(
|
|
1395
|
+
doc=doc,
|
|
1396
|
+
params=DocTagsParams(
|
|
1425
1397
|
xsize=xsize,
|
|
1426
1398
|
ysize=ysize,
|
|
1427
|
-
|
|
1428
|
-
|
|
1429
|
-
|
|
1430
|
-
|
|
1431
|
-
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
body += f"<{DocItemLabel.CAPTION.value}>"
|
|
1436
|
-
for caption in self.captions:
|
|
1437
|
-
body += caption.resolve(doc).get_location_tokens(
|
|
1438
|
-
doc=doc,
|
|
1439
|
-
new_line=new_line,
|
|
1440
|
-
xsize=xsize,
|
|
1441
|
-
ysize=ysize,
|
|
1442
|
-
)
|
|
1443
|
-
body += f"{text.strip()}"
|
|
1444
|
-
body += f"</{DocItemLabel.CAPTION.value}>"
|
|
1445
|
-
body += f"{new_line}"
|
|
1446
|
-
|
|
1447
|
-
body += f"</{otsl_tag}>\n"
|
|
1448
|
-
|
|
1449
|
-
return body
|
|
1399
|
+
add_location=add_location,
|
|
1400
|
+
add_caption=add_caption,
|
|
1401
|
+
add_table_cell_location=add_cell_location,
|
|
1402
|
+
add_table_cell_text=add_cell_text,
|
|
1403
|
+
),
|
|
1404
|
+
)
|
|
1405
|
+
text = serializer.serialize(item=self).text
|
|
1406
|
+
return text
|
|
1450
1407
|
|
|
1451
1408
|
|
|
1452
1409
|
class GraphCell(BaseModel):
|
|
@@ -1508,6 +1465,42 @@ class KeyValueItem(FloatingItem):
|
|
|
1508
1465
|
|
|
1509
1466
|
graph: GraphData
|
|
1510
1467
|
|
|
1468
|
+
def export_to_document_tokens(
|
|
1469
|
+
self,
|
|
1470
|
+
doc: "DoclingDocument",
|
|
1471
|
+
new_line: str = "", # deprecated
|
|
1472
|
+
xsize: int = 500,
|
|
1473
|
+
ysize: int = 500,
|
|
1474
|
+
add_location: bool = True,
|
|
1475
|
+
add_content: bool = True,
|
|
1476
|
+
):
|
|
1477
|
+
r"""Export key value item to document tokens format.
|
|
1478
|
+
|
|
1479
|
+
:param doc: "DoclingDocument":
|
|
1480
|
+
:param new_line: str (Default value = "") Deprecated
|
|
1481
|
+
:param xsize: int: (Default value = 500)
|
|
1482
|
+
:param ysize: int: (Default value = 500)
|
|
1483
|
+
:param add_location: bool: (Default value = True)
|
|
1484
|
+
:param add_content: bool: (Default value = True)
|
|
1485
|
+
|
|
1486
|
+
"""
|
|
1487
|
+
from docling_core.experimental.serializer.doctags import (
|
|
1488
|
+
DocTagsDocSerializer,
|
|
1489
|
+
DocTagsParams,
|
|
1490
|
+
)
|
|
1491
|
+
|
|
1492
|
+
serializer = DocTagsDocSerializer(
|
|
1493
|
+
doc=doc,
|
|
1494
|
+
params=DocTagsParams(
|
|
1495
|
+
xsize=xsize,
|
|
1496
|
+
ysize=ysize,
|
|
1497
|
+
add_location=add_location,
|
|
1498
|
+
add_content=add_content,
|
|
1499
|
+
),
|
|
1500
|
+
)
|
|
1501
|
+
text = serializer.serialize(item=self).text
|
|
1502
|
+
return text
|
|
1503
|
+
|
|
1511
1504
|
|
|
1512
1505
|
class FormItem(FloatingItem):
|
|
1513
1506
|
"""FormItem."""
|
|
@@ -2297,7 +2290,7 @@ class DoclingDocument(BaseModel):
|
|
|
2297
2290
|
with_groups: bool = False,
|
|
2298
2291
|
traverse_pictures: bool = False,
|
|
2299
2292
|
page_no: Optional[int] = None,
|
|
2300
|
-
included_content_layers: set[ContentLayer] =
|
|
2293
|
+
included_content_layers: Optional[set[ContentLayer]] = None,
|
|
2301
2294
|
_level: int = 0, # fixed parameter, carries through the node nesting level
|
|
2302
2295
|
) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
|
|
2303
2296
|
"""iterate_elements.
|
|
@@ -2310,6 +2303,11 @@ class DoclingDocument(BaseModel):
|
|
|
2310
2303
|
:param # fixed parameter:
|
|
2311
2304
|
:param carries through the node nesting level:
|
|
2312
2305
|
"""
|
|
2306
|
+
my_layers = (
|
|
2307
|
+
included_content_layers
|
|
2308
|
+
if included_content_layers is not None
|
|
2309
|
+
else DEFAULT_CONTENT_LAYERS
|
|
2310
|
+
)
|
|
2313
2311
|
if not root:
|
|
2314
2312
|
root = self.body
|
|
2315
2313
|
|
|
@@ -2325,7 +2323,7 @@ class DoclingDocument(BaseModel):
|
|
|
2325
2323
|
or any(prov.page_no == page_no for prov in root.prov)
|
|
2326
2324
|
)
|
|
2327
2325
|
)
|
|
2328
|
-
and root.content_layer in
|
|
2326
|
+
and root.content_layer in my_layers
|
|
2329
2327
|
)
|
|
2330
2328
|
|
|
2331
2329
|
if should_yield:
|
|
@@ -2345,7 +2343,7 @@ class DoclingDocument(BaseModel):
|
|
|
2345
2343
|
traverse_pictures=traverse_pictures,
|
|
2346
2344
|
page_no=page_no,
|
|
2347
2345
|
_level=_level + 1,
|
|
2348
|
-
included_content_layers=
|
|
2346
|
+
included_content_layers=my_layers,
|
|
2349
2347
|
)
|
|
2350
2348
|
|
|
2351
2349
|
def _clear_picture_pil_cache(self):
|
|
@@ -2475,12 +2473,14 @@ class DoclingDocument(BaseModel):
|
|
|
2475
2473
|
|
|
2476
2474
|
def save_as_json(
|
|
2477
2475
|
self,
|
|
2478
|
-
filename: Path,
|
|
2476
|
+
filename: Union[str, Path],
|
|
2479
2477
|
artifacts_dir: Optional[Path] = None,
|
|
2480
2478
|
image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
|
|
2481
2479
|
indent: int = 2,
|
|
2482
2480
|
):
|
|
2483
2481
|
"""Save as json."""
|
|
2482
|
+
if isinstance(filename, str):
|
|
2483
|
+
filename = Path(filename)
|
|
2484
2484
|
artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
|
|
2485
2485
|
|
|
2486
2486
|
if image_mode == ImageRefMode.REFERENCED:
|
|
@@ -2495,7 +2495,7 @@ class DoclingDocument(BaseModel):
|
|
|
2495
2495
|
json.dump(out, fw, indent=indent)
|
|
2496
2496
|
|
|
2497
2497
|
@classmethod
|
|
2498
|
-
def load_from_json(cls, filename: Path) -> "DoclingDocument":
|
|
2498
|
+
def load_from_json(cls, filename: Union[str, Path]) -> "DoclingDocument":
|
|
2499
2499
|
"""load_from_json.
|
|
2500
2500
|
|
|
2501
2501
|
:param filename: The filename to load a saved DoclingDocument from a .json.
|
|
@@ -2505,17 +2505,21 @@ class DoclingDocument(BaseModel):
|
|
|
2505
2505
|
:rtype: DoclingDocument
|
|
2506
2506
|
|
|
2507
2507
|
"""
|
|
2508
|
+
if isinstance(filename, str):
|
|
2509
|
+
filename = Path(filename)
|
|
2508
2510
|
with open(filename, "r", encoding="utf-8") as f:
|
|
2509
2511
|
return cls.model_validate_json(f.read())
|
|
2510
2512
|
|
|
2511
2513
|
def save_as_yaml(
|
|
2512
2514
|
self,
|
|
2513
|
-
filename: Path,
|
|
2515
|
+
filename: Union[str, Path],
|
|
2514
2516
|
artifacts_dir: Optional[Path] = None,
|
|
2515
2517
|
image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
|
|
2516
2518
|
default_flow_style: bool = False,
|
|
2517
2519
|
):
|
|
2518
2520
|
"""Save as yaml."""
|
|
2521
|
+
if isinstance(filename, str):
|
|
2522
|
+
filename = Path(filename)
|
|
2519
2523
|
artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
|
|
2520
2524
|
|
|
2521
2525
|
if image_mode == ImageRefMode.REFERENCED:
|
|
@@ -2530,7 +2534,7 @@ class DoclingDocument(BaseModel):
|
|
|
2530
2534
|
yaml.dump(out, fw, default_flow_style=default_flow_style)
|
|
2531
2535
|
|
|
2532
2536
|
@classmethod
|
|
2533
|
-
def load_from_yaml(cls, filename: Path) -> "DoclingDocument":
|
|
2537
|
+
def load_from_yaml(cls, filename: Union[str, Path]) -> "DoclingDocument":
|
|
2534
2538
|
"""load_from_yaml.
|
|
2535
2539
|
|
|
2536
2540
|
Args:
|
|
@@ -2539,6 +2543,8 @@ class DoclingDocument(BaseModel):
|
|
|
2539
2543
|
Returns:
|
|
2540
2544
|
DoclingDocument: the loaded DoclingDocument
|
|
2541
2545
|
"""
|
|
2546
|
+
if isinstance(filename, str):
|
|
2547
|
+
filename = Path(filename)
|
|
2542
2548
|
with open(filename, encoding="utf-8") as f:
|
|
2543
2549
|
data = yaml.load(f, Loader=yaml.FullLoader)
|
|
2544
2550
|
return DoclingDocument.model_validate(data)
|
|
@@ -2556,12 +2562,12 @@ class DoclingDocument(BaseModel):
|
|
|
2556
2562
|
|
|
2557
2563
|
def save_as_markdown(
|
|
2558
2564
|
self,
|
|
2559
|
-
filename: Path,
|
|
2565
|
+
filename: Union[str, Path],
|
|
2560
2566
|
artifacts_dir: Optional[Path] = None,
|
|
2561
2567
|
delim: str = "\n\n",
|
|
2562
2568
|
from_element: int = 0,
|
|
2563
2569
|
to_element: int = sys.maxsize,
|
|
2564
|
-
labels: set[DocItemLabel] =
|
|
2570
|
+
labels: Optional[set[DocItemLabel]] = None,
|
|
2565
2571
|
strict_text: bool = False,
|
|
2566
2572
|
escaping_underscores: bool = True,
|
|
2567
2573
|
image_placeholder: str = "<!-- image -->",
|
|
@@ -2569,9 +2575,12 @@ class DoclingDocument(BaseModel):
|
|
|
2569
2575
|
indent: int = 4,
|
|
2570
2576
|
text_width: int = -1,
|
|
2571
2577
|
page_no: Optional[int] = None,
|
|
2572
|
-
included_content_layers: set[ContentLayer] =
|
|
2578
|
+
included_content_layers: Optional[set[ContentLayer]] = None,
|
|
2579
|
+
page_break_placeholder: Optional[str] = None,
|
|
2573
2580
|
):
|
|
2574
2581
|
"""Save to markdown."""
|
|
2582
|
+
if isinstance(filename, str):
|
|
2583
|
+
filename = Path(filename)
|
|
2575
2584
|
artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
|
|
2576
2585
|
|
|
2577
2586
|
if image_mode == ImageRefMode.REFERENCED:
|
|
@@ -2587,13 +2596,14 @@ class DoclingDocument(BaseModel):
|
|
|
2587
2596
|
to_element=to_element,
|
|
2588
2597
|
labels=labels,
|
|
2589
2598
|
strict_text=strict_text,
|
|
2590
|
-
|
|
2599
|
+
escape_underscores=escaping_underscores,
|
|
2591
2600
|
image_placeholder=image_placeholder,
|
|
2592
2601
|
image_mode=image_mode,
|
|
2593
2602
|
indent=indent,
|
|
2594
2603
|
text_width=text_width,
|
|
2595
2604
|
page_no=page_no,
|
|
2596
2605
|
included_content_layers=included_content_layers,
|
|
2606
|
+
page_break_placeholder=page_break_placeholder,
|
|
2597
2607
|
)
|
|
2598
2608
|
|
|
2599
2609
|
with open(filename, "w", encoding="utf-8") as fw:
|
|
@@ -2604,15 +2614,16 @@ class DoclingDocument(BaseModel):
|
|
|
2604
2614
|
delim: str = "\n\n",
|
|
2605
2615
|
from_element: int = 0,
|
|
2606
2616
|
to_element: int = sys.maxsize,
|
|
2607
|
-
labels: set[DocItemLabel] =
|
|
2617
|
+
labels: Optional[set[DocItemLabel]] = None,
|
|
2608
2618
|
strict_text: bool = False,
|
|
2609
|
-
|
|
2619
|
+
escape_underscores: bool = True,
|
|
2610
2620
|
image_placeholder: str = "<!-- image -->",
|
|
2611
2621
|
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
|
|
2612
2622
|
indent: int = 4,
|
|
2613
2623
|
text_width: int = -1,
|
|
2614
2624
|
page_no: Optional[int] = None,
|
|
2615
|
-
included_content_layers: set[ContentLayer] =
|
|
2625
|
+
included_content_layers: Optional[set[ContentLayer]] = None,
|
|
2626
|
+
page_break_placeholder: Optional[str] = None, # e.g. "<!-- page break -->",
|
|
2616
2627
|
) -> str:
|
|
2617
2628
|
r"""Serialize to Markdown.
|
|
2618
2629
|
|
|
@@ -2627,8 +2638,9 @@ class DoclingDocument(BaseModel):
|
|
|
2627
2638
|
:param to_element: Body slicing stop index
|
|
2628
2639
|
(exclusive). (Default value = maxint).
|
|
2629
2640
|
:type to_element: int = sys.maxsize
|
|
2630
|
-
:param labels: The set of document labels to include in the export.
|
|
2631
|
-
|
|
2641
|
+
:param labels: The set of document labels to include in the export. None falls
|
|
2642
|
+
back to the system-defined default.
|
|
2643
|
+
:type labels: Optional[set[DocItemLabel]] = None
|
|
2632
2644
|
:param strict_text: Deprecated.
|
|
2633
2645
|
:type strict_text: bool = False
|
|
2634
2646
|
:param escape_underscores: bool: Whether to escape underscores in the
|
|
@@ -2643,30 +2655,40 @@ class DoclingDocument(BaseModel):
|
|
|
2643
2655
|
:param indent: The indent in spaces of the nested lists.
|
|
2644
2656
|
(Default value = 4).
|
|
2645
2657
|
:type indent: int = 4
|
|
2658
|
+
:param included_content_layers: The set of layers to include in the export. None
|
|
2659
|
+
falls back to the system-defined default.
|
|
2660
|
+
:type included_content_layers: Optional[set[ContentLayer]] = None
|
|
2661
|
+
:param page_break_placeholder: The placeholder to include for marking page
|
|
2662
|
+
breaks. None means no page break placeholder will be used.
|
|
2663
|
+
:type page_break_placeholder: Optional[str] = None
|
|
2646
2664
|
:returns: The exported Markdown representation.
|
|
2647
2665
|
:rtype: str
|
|
2648
2666
|
"""
|
|
2649
2667
|
from docling_core.experimental.serializer.markdown import (
|
|
2650
2668
|
MarkdownDocSerializer,
|
|
2651
|
-
|
|
2652
|
-
MarkdownTextSerializer,
|
|
2669
|
+
MarkdownParams,
|
|
2653
2670
|
)
|
|
2654
2671
|
|
|
2672
|
+
my_labels = labels if labels is not None else DOCUMENT_TOKENS_EXPORT_LABELS
|
|
2673
|
+
my_layers = (
|
|
2674
|
+
included_content_layers
|
|
2675
|
+
if included_content_layers is not None
|
|
2676
|
+
else DEFAULT_CONTENT_LAYERS
|
|
2677
|
+
)
|
|
2655
2678
|
serializer = MarkdownDocSerializer(
|
|
2656
2679
|
doc=self,
|
|
2657
|
-
|
|
2658
|
-
|
|
2659
|
-
|
|
2660
|
-
|
|
2661
|
-
|
|
2662
|
-
|
|
2663
|
-
|
|
2664
|
-
|
|
2665
|
-
|
|
2666
|
-
wrap_width=text_width if text_width > 0 else None,
|
|
2667
|
-
),
|
|
2668
|
-
list_serializer=MarkdownListSerializer(
|
|
2680
|
+
params=MarkdownParams(
|
|
2681
|
+
labels=my_labels,
|
|
2682
|
+
layers=my_layers,
|
|
2683
|
+
pages={page_no} if page_no is not None else None,
|
|
2684
|
+
start_idx=from_element,
|
|
2685
|
+
stop_idx=to_element,
|
|
2686
|
+
escape_underscores=escape_underscores,
|
|
2687
|
+
image_placeholder=image_placeholder,
|
|
2688
|
+
image_mode=image_mode,
|
|
2669
2689
|
indent=indent,
|
|
2690
|
+
wrap_width=text_width if text_width > 0 else None,
|
|
2691
|
+
page_break_placeholder=page_break_placeholder,
|
|
2670
2692
|
),
|
|
2671
2693
|
)
|
|
2672
2694
|
ser_res = serializer.serialize()
|
|
@@ -2687,34 +2709,38 @@ class DoclingDocument(BaseModel):
|
|
|
2687
2709
|
delim: str = "\n\n",
|
|
2688
2710
|
from_element: int = 0,
|
|
2689
2711
|
to_element: int = 1000000,
|
|
2690
|
-
labels: set[DocItemLabel] =
|
|
2712
|
+
labels: Optional[set[DocItemLabel]] = None,
|
|
2691
2713
|
) -> str:
|
|
2692
2714
|
"""export_to_text."""
|
|
2715
|
+
my_labels = labels if labels is not None else DOCUMENT_TOKENS_EXPORT_LABELS
|
|
2716
|
+
|
|
2693
2717
|
return self.export_to_markdown(
|
|
2694
|
-
delim,
|
|
2695
|
-
from_element,
|
|
2696
|
-
to_element,
|
|
2697
|
-
labels,
|
|
2718
|
+
delim=delim,
|
|
2719
|
+
from_element=from_element,
|
|
2720
|
+
to_element=to_element,
|
|
2721
|
+
labels=my_labels,
|
|
2698
2722
|
strict_text=True,
|
|
2699
|
-
|
|
2723
|
+
escape_underscores=False,
|
|
2700
2724
|
image_placeholder="",
|
|
2701
2725
|
)
|
|
2702
2726
|
|
|
2703
2727
|
def save_as_html(
|
|
2704
2728
|
self,
|
|
2705
|
-
filename: Path,
|
|
2729
|
+
filename: Union[str, Path],
|
|
2706
2730
|
artifacts_dir: Optional[Path] = None,
|
|
2707
2731
|
from_element: int = 0,
|
|
2708
2732
|
to_element: int = sys.maxsize,
|
|
2709
|
-
labels: set[DocItemLabel] =
|
|
2733
|
+
labels: Optional[set[DocItemLabel]] = None,
|
|
2710
2734
|
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
|
|
2711
2735
|
formula_to_mathml: bool = True,
|
|
2712
2736
|
page_no: Optional[int] = None,
|
|
2713
2737
|
html_lang: str = "en",
|
|
2714
2738
|
html_head: str = _HTML_DEFAULT_HEAD,
|
|
2715
|
-
included_content_layers: set[ContentLayer] =
|
|
2739
|
+
included_content_layers: Optional[set[ContentLayer]] = None,
|
|
2716
2740
|
):
|
|
2717
2741
|
"""Save to HTML."""
|
|
2742
|
+
if isinstance(filename, str):
|
|
2743
|
+
filename = Path(filename)
|
|
2718
2744
|
artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
|
|
2719
2745
|
|
|
2720
2746
|
if image_mode == ImageRefMode.REFERENCED:
|
|
@@ -2740,8 +2766,10 @@ class DoclingDocument(BaseModel):
|
|
|
2740
2766
|
fw.write(html_out)
|
|
2741
2767
|
|
|
2742
2768
|
def _get_output_paths(
|
|
2743
|
-
self, filename: Path, artifacts_dir: Optional[Path] = None
|
|
2769
|
+
self, filename: Union[str, Path], artifacts_dir: Optional[Path] = None
|
|
2744
2770
|
) -> Tuple[Path, Optional[Path]]:
|
|
2771
|
+
if isinstance(filename, str):
|
|
2772
|
+
filename = Path(filename)
|
|
2745
2773
|
if artifacts_dir is None:
|
|
2746
2774
|
# Remove the extension and add '_pictures'
|
|
2747
2775
|
artifacts_dir = filename.with_suffix("")
|
|
@@ -2775,15 +2803,21 @@ class DoclingDocument(BaseModel):
|
|
|
2775
2803
|
self,
|
|
2776
2804
|
from_element: int = 0,
|
|
2777
2805
|
to_element: int = sys.maxsize,
|
|
2778
|
-
labels: set[DocItemLabel] =
|
|
2806
|
+
labels: Optional[set[DocItemLabel]] = None,
|
|
2779
2807
|
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
|
|
2780
2808
|
formula_to_mathml: bool = True,
|
|
2781
2809
|
page_no: Optional[int] = None,
|
|
2782
2810
|
html_lang: str = "en",
|
|
2783
2811
|
html_head: str = _HTML_DEFAULT_HEAD,
|
|
2784
|
-
included_content_layers: set[ContentLayer] =
|
|
2812
|
+
included_content_layers: Optional[set[ContentLayer]] = None,
|
|
2785
2813
|
) -> str:
|
|
2786
2814
|
r"""Serialize to HTML."""
|
|
2815
|
+
my_labels = labels if labels is not None else DEFAULT_EXPORT_LABELS
|
|
2816
|
+
my_layers = (
|
|
2817
|
+
included_content_layers
|
|
2818
|
+
if included_content_layers is not None
|
|
2819
|
+
else DEFAULT_CONTENT_LAYERS
|
|
2820
|
+
)
|
|
2787
2821
|
|
|
2788
2822
|
def close_lists(
|
|
2789
2823
|
curr_level: int,
|
|
@@ -2831,7 +2865,7 @@ class DoclingDocument(BaseModel):
|
|
|
2831
2865
|
self.body,
|
|
2832
2866
|
with_groups=True,
|
|
2833
2867
|
page_no=page_no,
|
|
2834
|
-
included_content_layers=
|
|
2868
|
+
included_content_layers=my_layers,
|
|
2835
2869
|
)
|
|
2836
2870
|
):
|
|
2837
2871
|
# If we've moved to a lower level, we're exiting one or more groups
|
|
@@ -2853,7 +2887,7 @@ class DoclingDocument(BaseModel):
|
|
|
2853
2887
|
if ix < from_element or to_element <= ix:
|
|
2854
2888
|
continue # skip as many items as you want
|
|
2855
2889
|
|
|
2856
|
-
if (isinstance(item, DocItem)) and (item.label not in
|
|
2890
|
+
if (isinstance(item, DocItem)) and (item.label not in my_labels):
|
|
2857
2891
|
continue # skip any label that is not whitelisted
|
|
2858
2892
|
|
|
2859
2893
|
if isinstance(item, GroupItem) and item.label in [
|
|
@@ -3000,7 +3034,7 @@ class DoclingDocument(BaseModel):
|
|
|
3000
3034
|
)
|
|
3001
3035
|
)
|
|
3002
3036
|
|
|
3003
|
-
elif isinstance(item, DocItem) and item.label in
|
|
3037
|
+
elif isinstance(item, DocItem) and item.label in my_labels:
|
|
3004
3038
|
continue
|
|
3005
3039
|
|
|
3006
3040
|
html_texts.append("</html>")
|
|
@@ -3037,6 +3071,7 @@ class DoclingDocument(BaseModel):
|
|
|
3037
3071
|
"list_item": DocItemLabel.LIST_ITEM,
|
|
3038
3072
|
"footnote": DocItemLabel.FOOTNOTE,
|
|
3039
3073
|
"code": DocItemLabel.CODE,
|
|
3074
|
+
"key_value_region": DocItemLabel.KEY_VALUE_REGION,
|
|
3040
3075
|
}
|
|
3041
3076
|
|
|
3042
3077
|
def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
|
|
@@ -3189,7 +3224,7 @@ class DoclingDocument(BaseModel):
|
|
|
3189
3224
|
token
|
|
3190
3225
|
for token in tokens
|
|
3191
3226
|
if not (
|
|
3192
|
-
token.startswith(rf"<{
|
|
3227
|
+
token.startswith(rf"<{_LOC_PREFIX}")
|
|
3193
3228
|
or token
|
|
3194
3229
|
in [
|
|
3195
3230
|
rf"<{DocumentToken.OTSL.value}>",
|
|
@@ -3203,7 +3238,7 @@ class DoclingDocument(BaseModel):
|
|
|
3203
3238
|
token
|
|
3204
3239
|
for token in text_parts
|
|
3205
3240
|
if not (
|
|
3206
|
-
token.startswith(rf"<{
|
|
3241
|
+
token.startswith(rf"<{_LOC_PREFIX}")
|
|
3207
3242
|
or token
|
|
3208
3243
|
in [
|
|
3209
3244
|
rf"<{DocumentToken.OTSL.value}>",
|
|
@@ -3228,6 +3263,95 @@ class DoclingDocument(BaseModel):
|
|
|
3228
3263
|
table_cells=table_cells,
|
|
3229
3264
|
)
|
|
3230
3265
|
|
|
3266
|
+
def parse_key_value_item(
|
|
3267
|
+
tokens: str, image: Optional[PILImage.Image] = None
|
|
3268
|
+
) -> Tuple[GraphData, Optional[ProvenanceItem]]:
|
|
3269
|
+
if image is not None:
|
|
3270
|
+
pg_width = image.width
|
|
3271
|
+
pg_height = image.height
|
|
3272
|
+
else:
|
|
3273
|
+
pg_width = 1
|
|
3274
|
+
pg_height = 1
|
|
3275
|
+
|
|
3276
|
+
start_locs_match = re.search(r"<key_value_region>(.*?)<key", tokens)
|
|
3277
|
+
if start_locs_match:
|
|
3278
|
+
overall_locs = start_locs_match.group(1)
|
|
3279
|
+
overall_bbox = extract_bounding_box(overall_locs) if image else None
|
|
3280
|
+
overall_prov = (
|
|
3281
|
+
ProvenanceItem(
|
|
3282
|
+
bbox=overall_bbox.resize_by_scale(pg_width, pg_height),
|
|
3283
|
+
charspan=(0, 0),
|
|
3284
|
+
page_no=1,
|
|
3285
|
+
)
|
|
3286
|
+
if overall_bbox
|
|
3287
|
+
else None
|
|
3288
|
+
)
|
|
3289
|
+
else:
|
|
3290
|
+
overall_prov = None
|
|
3291
|
+
|
|
3292
|
+
# here we assumed the labels as only key or value, later on we can update
|
|
3293
|
+
# it to have unspecified, checkbox etc.
|
|
3294
|
+
cell_pattern = re.compile(
|
|
3295
|
+
r"<(?P<label>key|value)_(?P<id>\d+)>"
|
|
3296
|
+
r"(?P<content>.*?)"
|
|
3297
|
+
r"</(?P=label)_(?P=id)>",
|
|
3298
|
+
re.DOTALL,
|
|
3299
|
+
)
|
|
3300
|
+
|
|
3301
|
+
cells: List["GraphCell"] = []
|
|
3302
|
+
links: List["GraphLink"] = []
|
|
3303
|
+
raw_link_predictions = []
|
|
3304
|
+
|
|
3305
|
+
for cell_match in cell_pattern.finditer(tokens):
|
|
3306
|
+
cell_label_str = cell_match.group("label") # "key" or "value"
|
|
3307
|
+
cell_id = int(cell_match.group("id"))
|
|
3308
|
+
raw_content = cell_match.group("content")
|
|
3309
|
+
|
|
3310
|
+
# link tokens
|
|
3311
|
+
link_matches = re.findall(r"<link_(\d+)>", raw_content)
|
|
3312
|
+
|
|
3313
|
+
cell_bbox = extract_bounding_box(raw_content) if image else None
|
|
3314
|
+
cell_prov = None
|
|
3315
|
+
if cell_bbox is not None:
|
|
3316
|
+
cell_prov = ProvenanceItem(
|
|
3317
|
+
bbox=cell_bbox.resize_by_scale(pg_width, pg_height),
|
|
3318
|
+
charspan=(0, 0),
|
|
3319
|
+
page_no=1,
|
|
3320
|
+
)
|
|
3321
|
+
|
|
3322
|
+
cleaned_text = re.sub(r"<loc_\d+>", "", raw_content)
|
|
3323
|
+
cleaned_text = re.sub(r"<link_\d+>", "", cleaned_text).strip()
|
|
3324
|
+
|
|
3325
|
+
cell_obj = GraphCell(
|
|
3326
|
+
label=GraphCellLabel(cell_label_str),
|
|
3327
|
+
cell_id=cell_id,
|
|
3328
|
+
text=cleaned_text,
|
|
3329
|
+
orig=cleaned_text,
|
|
3330
|
+
prov=cell_prov,
|
|
3331
|
+
item_ref=None,
|
|
3332
|
+
)
|
|
3333
|
+
cells.append(cell_obj)
|
|
3334
|
+
|
|
3335
|
+
cell_ids = {cell.cell_id for cell in cells}
|
|
3336
|
+
|
|
3337
|
+
for target_str in link_matches:
|
|
3338
|
+
raw_link_predictions.append((cell_id, int(target_str)))
|
|
3339
|
+
|
|
3340
|
+
cell_ids = {cell.cell_id for cell in cells}
|
|
3341
|
+
|
|
3342
|
+
for source_id, target_id in raw_link_predictions:
|
|
3343
|
+
# basic check to validate the prediction
|
|
3344
|
+
if target_id not in cell_ids:
|
|
3345
|
+
continue
|
|
3346
|
+
link_obj = GraphLink(
|
|
3347
|
+
label=GraphLinkLabel.TO_VALUE,
|
|
3348
|
+
source_cell_id=source_id,
|
|
3349
|
+
target_cell_id=target_id,
|
|
3350
|
+
)
|
|
3351
|
+
links.append(link_obj)
|
|
3352
|
+
|
|
3353
|
+
return (GraphData(cells=cells, links=links), overall_prov)
|
|
3354
|
+
|
|
3231
3355
|
# doc = DoclingDocument(name="Document")
|
|
3232
3356
|
for pg_idx, doctag_page in enumerate(doctag_document.pages):
|
|
3233
3357
|
page_doctags = doctag_page.tokens
|
|
@@ -3243,6 +3367,12 @@ class DoclingDocument(BaseModel):
|
|
|
3243
3367
|
pg_width = 1
|
|
3244
3368
|
pg_height = 1
|
|
3245
3369
|
|
|
3370
|
+
self.add_page(
|
|
3371
|
+
page_no=page_no,
|
|
3372
|
+
size=Size(width=pg_width, height=pg_height),
|
|
3373
|
+
image=ImageRef.from_pil(image=image, dpi=72) if image else None,
|
|
3374
|
+
)
|
|
3375
|
+
|
|
3246
3376
|
"""
|
|
3247
3377
|
1. Finds all <tag>...</tag>
|
|
3248
3378
|
blocks in the entire string (multi-line friendly)
|
|
@@ -3263,6 +3393,7 @@ class DoclingDocument(BaseModel):
|
|
|
3263
3393
|
rf"{DocItemLabel.SECTION_HEADER}_level_1|"
|
|
3264
3394
|
rf"{DocumentToken.ORDERED_LIST.value}|"
|
|
3265
3395
|
rf"{DocumentToken.UNORDERED_LIST.value}|"
|
|
3396
|
+
rf"{DocItemLabel.KEY_VALUE_REGION}|"
|
|
3266
3397
|
rf"{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
|
|
3267
3398
|
)
|
|
3268
3399
|
|
|
@@ -3348,6 +3479,11 @@ class DoclingDocument(BaseModel):
|
|
|
3348
3479
|
parent=None,
|
|
3349
3480
|
)
|
|
3350
3481
|
pic.captions.append(caption_item.get_ref())
|
|
3482
|
+
elif tag_name == DocItemLabel.KEY_VALUE_REGION:
|
|
3483
|
+
key_value_data, kv_item_prov = parse_key_value_item(
|
|
3484
|
+
full_chunk, image
|
|
3485
|
+
)
|
|
3486
|
+
self.add_key_values(graph=key_value_data, prov=kv_item_prov)
|
|
3351
3487
|
elif tag_name in [
|
|
3352
3488
|
DocumentToken.ORDERED_LIST.value,
|
|
3353
3489
|
DocumentToken.UNORDERED_LIST.value,
|
|
@@ -3392,18 +3528,25 @@ class DoclingDocument(BaseModel):
|
|
|
3392
3528
|
else:
|
|
3393
3529
|
# For everything else, treat as text
|
|
3394
3530
|
text_content = extract_inner_text(full_chunk)
|
|
3531
|
+
element_prov = (
|
|
3532
|
+
ProvenanceItem(
|
|
3533
|
+
bbox=bbox.resize_by_scale(pg_width, pg_height),
|
|
3534
|
+
charspan=(0, len(text_content)),
|
|
3535
|
+
page_no=page_no,
|
|
3536
|
+
)
|
|
3537
|
+
if bbox
|
|
3538
|
+
else None
|
|
3539
|
+
)
|
|
3540
|
+
|
|
3541
|
+
content_layer = ContentLayer.BODY
|
|
3542
|
+
if tag_name in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
|
|
3543
|
+
content_layer = ContentLayer.FURNITURE
|
|
3544
|
+
|
|
3395
3545
|
self.add_text(
|
|
3396
3546
|
label=doc_label,
|
|
3397
3547
|
text=text_content,
|
|
3398
|
-
prov=
|
|
3399
|
-
|
|
3400
|
-
bbox=bbox.resize_by_scale(pg_width, pg_height),
|
|
3401
|
-
charspan=(0, len(text_content)),
|
|
3402
|
-
page_no=page_no,
|
|
3403
|
-
)
|
|
3404
|
-
if bbox
|
|
3405
|
-
else None
|
|
3406
|
-
),
|
|
3548
|
+
prov=element_prov,
|
|
3549
|
+
content_layer=content_layer,
|
|
3407
3550
|
)
|
|
3408
3551
|
return self
|
|
3409
3552
|
|
|
@@ -3414,11 +3557,11 @@ class DoclingDocument(BaseModel):
|
|
|
3414
3557
|
|
|
3415
3558
|
def save_as_doctags(
|
|
3416
3559
|
self,
|
|
3417
|
-
filename: Path,
|
|
3560
|
+
filename: Union[str, Path],
|
|
3418
3561
|
delim: str = "",
|
|
3419
3562
|
from_element: int = 0,
|
|
3420
3563
|
to_element: int = sys.maxsize,
|
|
3421
|
-
labels: set[DocItemLabel] =
|
|
3564
|
+
labels: Optional[set[DocItemLabel]] = None,
|
|
3422
3565
|
xsize: int = 500,
|
|
3423
3566
|
ysize: int = 500,
|
|
3424
3567
|
add_location: bool = True,
|
|
@@ -3427,9 +3570,12 @@ class DoclingDocument(BaseModel):
|
|
|
3427
3570
|
# table specific flags
|
|
3428
3571
|
add_table_cell_location: bool = False,
|
|
3429
3572
|
add_table_cell_text: bool = True,
|
|
3573
|
+
minified: bool = False,
|
|
3430
3574
|
):
|
|
3431
3575
|
r"""Save the document content to DocTags format."""
|
|
3432
|
-
|
|
3576
|
+
if isinstance(filename, str):
|
|
3577
|
+
filename = Path(filename)
|
|
3578
|
+
out = self.export_to_doctags(
|
|
3433
3579
|
delim=delim,
|
|
3434
3580
|
from_element=from_element,
|
|
3435
3581
|
to_element=to_element,
|
|
@@ -3442,17 +3588,23 @@ class DoclingDocument(BaseModel):
|
|
|
3442
3588
|
# table specific flags
|
|
3443
3589
|
add_table_cell_location=add_table_cell_location,
|
|
3444
3590
|
add_table_cell_text=add_table_cell_text,
|
|
3591
|
+
minified=minified,
|
|
3445
3592
|
)
|
|
3446
3593
|
|
|
3447
3594
|
with open(filename, "w", encoding="utf-8") as fw:
|
|
3448
3595
|
fw.write(out)
|
|
3449
3596
|
|
|
3450
|
-
|
|
3597
|
+
@deprecated("Use export_to_doctags() instead.")
|
|
3598
|
+
def export_to_document_tokens(self, *args, **kwargs):
|
|
3599
|
+
r"""Export to DocTags format."""
|
|
3600
|
+
return self.export_to_doctags(*args, **kwargs)
|
|
3601
|
+
|
|
3602
|
+
def export_to_doctags( # noqa: C901
|
|
3451
3603
|
self,
|
|
3452
|
-
delim: str = "",
|
|
3604
|
+
delim: str = "", # deprecated
|
|
3453
3605
|
from_element: int = 0,
|
|
3454
3606
|
to_element: int = sys.maxsize,
|
|
3455
|
-
labels: set[DocItemLabel] =
|
|
3607
|
+
labels: Optional[set[DocItemLabel]] = None,
|
|
3456
3608
|
xsize: int = 500,
|
|
3457
3609
|
ysize: int = 500,
|
|
3458
3610
|
add_location: bool = True,
|
|
@@ -3461,13 +3613,14 @@ class DoclingDocument(BaseModel):
|
|
|
3461
3613
|
# table specific flags
|
|
3462
3614
|
add_table_cell_location: bool = False,
|
|
3463
3615
|
add_table_cell_text: bool = True,
|
|
3616
|
+
minified: bool = False,
|
|
3464
3617
|
) -> str:
|
|
3465
3618
|
r"""Exports the document content to a DocumentToken format.
|
|
3466
3619
|
|
|
3467
3620
|
Operates on a slice of the document's body as defined through arguments
|
|
3468
3621
|
from_element and to_element; defaulting to the whole main_text.
|
|
3469
3622
|
|
|
3470
|
-
:param delim: str: (Default value = "")
|
|
3623
|
+
:param delim: str: (Default value = "") Deprecated
|
|
3471
3624
|
:param from_element: int: (Default value = 0)
|
|
3472
3625
|
:param to_element: Optional[int]: (Default value = None)
|
|
3473
3626
|
:param labels: set[DocItemLabel]
|
|
@@ -3478,199 +3631,40 @@ class DoclingDocument(BaseModel):
|
|
|
3478
3631
|
:param add_page_index: bool: (Default value = True)
|
|
3479
3632
|
:param # table specific flagsadd_table_cell_location: bool
|
|
3480
3633
|
:param add_table_cell_text: bool: (Default value = True)
|
|
3634
|
+
:param minified: bool: (Default value = False)
|
|
3481
3635
|
:returns: The content of the document formatted as a DocTags string.
|
|
3482
3636
|
:rtype: str
|
|
3483
3637
|
"""
|
|
3484
|
-
|
|
3485
|
-
|
|
3486
|
-
|
|
3487
|
-
previous_level: int,
|
|
3488
|
-
ordered_list_stack: List[bool],
|
|
3489
|
-
output_parts: List[str],
|
|
3490
|
-
) -> List[bool]:
|
|
3491
|
-
"""Close open list tags until the nesting level matches item's level."""
|
|
3492
|
-
while current_level < previous_level and ordered_list_stack:
|
|
3493
|
-
last_is_ordered = ordered_list_stack.pop()
|
|
3494
|
-
if last_is_ordered:
|
|
3495
|
-
output_parts.append(f"</{DocumentToken.ORDERED_LIST.value}>\n")
|
|
3496
|
-
else:
|
|
3497
|
-
output_parts.append(f"</{DocumentToken.UNORDERED_LIST.value}>\n")
|
|
3498
|
-
previous_level -= 1
|
|
3499
|
-
return ordered_list_stack
|
|
3500
|
-
|
|
3501
|
-
def _add_page_break_if_needed(
|
|
3502
|
-
output_parts: List[str],
|
|
3503
|
-
item,
|
|
3504
|
-
prev_page_no,
|
|
3505
|
-
page_break_enabled: bool,
|
|
3506
|
-
):
|
|
3507
|
-
"""Inserts a page-break token.
|
|
3508
|
-
|
|
3509
|
-
Inserts a page-break token if the item's page number is different
|
|
3510
|
-
from the previous item and page breaks are enabled.
|
|
3511
|
-
Returns the updated output_parts list and the current page number.
|
|
3512
|
-
"""
|
|
3513
|
-
if not page_break_enabled:
|
|
3514
|
-
return output_parts, prev_page_no
|
|
3515
|
-
|
|
3516
|
-
if not item.prov:
|
|
3517
|
-
return output_parts, prev_page_no
|
|
3518
|
-
|
|
3519
|
-
current_page_no = item.prov[0].page_no
|
|
3520
|
-
if prev_page_no is None:
|
|
3521
|
-
return output_parts, current_page_no
|
|
3522
|
-
|
|
3523
|
-
if current_page_no != prev_page_no:
|
|
3524
|
-
output_parts.append(f"<{DocumentToken.PAGE_BREAK.value}>\n")
|
|
3525
|
-
|
|
3526
|
-
return output_parts, current_page_no
|
|
3527
|
-
|
|
3528
|
-
def _get_standalone_captions(document_body):
|
|
3529
|
-
"""Identify captions that are not attached to any table or figure."""
|
|
3530
|
-
all_captions = set()
|
|
3531
|
-
matched_captions = set()
|
|
3532
|
-
for item, _ in self.iterate_items(document_body, with_groups=True):
|
|
3533
|
-
if item.label == DocItemLabel.CAPTION:
|
|
3534
|
-
all_captions.update([item.self_ref])
|
|
3535
|
-
if item.label in [DocItemLabel.PICTURE, DocItemLabel.TABLE]:
|
|
3536
|
-
matched_captions.update([caption.cref for caption in item.captions])
|
|
3537
|
-
|
|
3538
|
-
return all_captions - matched_captions
|
|
3539
|
-
|
|
3540
|
-
# Initialization
|
|
3541
|
-
output_parts: List[str] = []
|
|
3542
|
-
ordered_list_stack: List[bool] = []
|
|
3543
|
-
previous_level = 0
|
|
3544
|
-
previous_page_no = None
|
|
3545
|
-
|
|
3546
|
-
# Precompute standalone captions
|
|
3547
|
-
standalone_captions = _get_standalone_captions(self.body)
|
|
3548
|
-
|
|
3549
|
-
# Begin document
|
|
3550
|
-
output_parts.append(f"<{DocumentToken.DOCUMENT.value}>{delim}")
|
|
3551
|
-
|
|
3552
|
-
for ix, (item, current_level) in enumerate(
|
|
3553
|
-
self.iterate_items(
|
|
3554
|
-
self.body,
|
|
3555
|
-
with_groups=True,
|
|
3556
|
-
included_content_layers={
|
|
3557
|
-
ContentLayer.BODY,
|
|
3558
|
-
ContentLayer.FURNITURE,
|
|
3559
|
-
},
|
|
3560
|
-
)
|
|
3561
|
-
):
|
|
3562
|
-
# Close lists if we've moved to a lower nesting level
|
|
3563
|
-
if current_level < previous_level and ordered_list_stack:
|
|
3564
|
-
ordered_list_stack = _close_lists(
|
|
3565
|
-
current_level,
|
|
3566
|
-
previous_level,
|
|
3567
|
-
ordered_list_stack,
|
|
3568
|
-
output_parts,
|
|
3569
|
-
)
|
|
3570
|
-
previous_level = current_level
|
|
3571
|
-
|
|
3572
|
-
# Skip items outside the specified element range
|
|
3573
|
-
if ix < from_element or ix >= to_element:
|
|
3574
|
-
continue
|
|
3575
|
-
|
|
3576
|
-
# Skip items whose label is not in the allowed set
|
|
3577
|
-
if isinstance(item, DocItem) and (item.label not in labels):
|
|
3578
|
-
continue
|
|
3579
|
-
|
|
3580
|
-
# Skip captions that are not standalone as they will be included below
|
|
3581
|
-
# by the export functions of Table and Picture
|
|
3582
|
-
if (
|
|
3583
|
-
isinstance(item, TextItem)
|
|
3584
|
-
and item.label == DocItemLabel.CAPTION
|
|
3585
|
-
and item.self_ref not in standalone_captions
|
|
3586
|
-
):
|
|
3587
|
-
continue
|
|
3588
|
-
|
|
3589
|
-
# Handle list groups
|
|
3590
|
-
if isinstance(item, GroupItem):
|
|
3591
|
-
if item.label == GroupLabel.ORDERED_LIST:
|
|
3592
|
-
output_parts.append(f"<{DocumentToken.ORDERED_LIST.value}>{delim}")
|
|
3593
|
-
ordered_list_stack.append(True)
|
|
3594
|
-
elif item.label == GroupLabel.LIST:
|
|
3595
|
-
output_parts.append(
|
|
3596
|
-
f"<{DocumentToken.UNORDERED_LIST.value}>{delim}"
|
|
3597
|
-
)
|
|
3598
|
-
ordered_list_stack.append(False)
|
|
3599
|
-
continue
|
|
3600
|
-
|
|
3601
|
-
# For other item types, optionally insert page-break if the page changed
|
|
3602
|
-
output_parts, previous_page_no = _add_page_break_if_needed(
|
|
3603
|
-
output_parts, item, previous_page_no, add_page_index
|
|
3604
|
-
)
|
|
3605
|
-
|
|
3606
|
-
if isinstance(item, SectionHeaderItem):
|
|
3607
|
-
output_parts.append(
|
|
3608
|
-
item.export_to_document_tokens(
|
|
3609
|
-
doc=self,
|
|
3610
|
-
new_line=delim,
|
|
3611
|
-
xsize=xsize,
|
|
3612
|
-
ysize=ysize,
|
|
3613
|
-
add_location=add_location,
|
|
3614
|
-
add_content=add_content,
|
|
3615
|
-
)
|
|
3616
|
-
)
|
|
3617
|
-
elif isinstance(item, CodeItem):
|
|
3618
|
-
output_parts.append(
|
|
3619
|
-
item.export_to_document_tokens(
|
|
3620
|
-
doc=self,
|
|
3621
|
-
new_line=delim,
|
|
3622
|
-
xsize=xsize,
|
|
3623
|
-
ysize=ysize,
|
|
3624
|
-
add_location=add_location,
|
|
3625
|
-
add_content=add_content,
|
|
3626
|
-
)
|
|
3627
|
-
)
|
|
3628
|
-
elif isinstance(item, TextItem):
|
|
3629
|
-
output_parts.append(
|
|
3630
|
-
item.export_to_document_tokens(
|
|
3631
|
-
doc=self,
|
|
3632
|
-
new_line=delim,
|
|
3633
|
-
xsize=xsize,
|
|
3634
|
-
ysize=ysize,
|
|
3635
|
-
add_location=add_location,
|
|
3636
|
-
add_content=add_content,
|
|
3637
|
-
)
|
|
3638
|
-
)
|
|
3639
|
-
elif isinstance(item, TableItem):
|
|
3640
|
-
output_parts.append(
|
|
3641
|
-
item.export_to_document_tokens(
|
|
3642
|
-
doc=self,
|
|
3643
|
-
new_line=delim,
|
|
3644
|
-
xsize=xsize,
|
|
3645
|
-
ysize=ysize,
|
|
3646
|
-
add_location=add_location,
|
|
3647
|
-
add_cell_location=add_table_cell_location,
|
|
3648
|
-
add_cell_text=add_table_cell_text,
|
|
3649
|
-
add_caption=True,
|
|
3650
|
-
)
|
|
3651
|
-
)
|
|
3652
|
-
elif isinstance(item, PictureItem):
|
|
3653
|
-
output_parts.append(
|
|
3654
|
-
item.export_to_document_tokens(
|
|
3655
|
-
doc=self,
|
|
3656
|
-
new_line=delim,
|
|
3657
|
-
xsize=xsize,
|
|
3658
|
-
ysize=ysize,
|
|
3659
|
-
add_caption=True,
|
|
3660
|
-
add_location=add_location,
|
|
3661
|
-
add_content=add_content,
|
|
3662
|
-
)
|
|
3663
|
-
)
|
|
3664
|
-
|
|
3665
|
-
# End any lists that might still be open
|
|
3666
|
-
ordered_list_stack = _close_lists(
|
|
3667
|
-
0, previous_level, ordered_list_stack, output_parts
|
|
3638
|
+
from docling_core.experimental.serializer.doctags import (
|
|
3639
|
+
DocTagsDocSerializer,
|
|
3640
|
+
DocTagsParams,
|
|
3668
3641
|
)
|
|
3669
3642
|
|
|
3670
|
-
|
|
3671
|
-
|
|
3672
|
-
|
|
3673
|
-
|
|
3643
|
+
my_labels = labels if labels is not None else DOCUMENT_TOKENS_EXPORT_LABELS
|
|
3644
|
+
serializer = DocTagsDocSerializer(
|
|
3645
|
+
doc=self,
|
|
3646
|
+
params=DocTagsParams(
|
|
3647
|
+
labels=my_labels,
|
|
3648
|
+
# layers=..., # not exposed
|
|
3649
|
+
start_idx=from_element,
|
|
3650
|
+
stop_idx=to_element,
|
|
3651
|
+
xsize=xsize,
|
|
3652
|
+
ysize=ysize,
|
|
3653
|
+
add_location=add_location,
|
|
3654
|
+
# add_caption=..., # not exposed
|
|
3655
|
+
add_content=add_content,
|
|
3656
|
+
add_page_break=add_page_index,
|
|
3657
|
+
add_table_cell_location=add_table_cell_location,
|
|
3658
|
+
add_table_cell_text=add_table_cell_text,
|
|
3659
|
+
mode=(
|
|
3660
|
+
DocTagsParams.Mode.MINIFIED
|
|
3661
|
+
if minified
|
|
3662
|
+
else DocTagsParams.Mode.HUMAN_FRIENDLY
|
|
3663
|
+
),
|
|
3664
|
+
),
|
|
3665
|
+
)
|
|
3666
|
+
ser_res = serializer.serialize()
|
|
3667
|
+
return ser_res.text
|
|
3674
3668
|
|
|
3675
3669
|
def _export_to_indented_text(
|
|
3676
3670
|
self,
|