docling-core 2.18.1__py3-none-any.whl → 2.19.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/chunker/hierarchical_chunker.py +5 -2
- docling_core/types/doc/document.py +333 -295
- docling_core/types/doc/labels.py +1 -1
- docling_core/types/doc/tokens.py +20 -94
- {docling_core-2.18.1.dist-info → docling_core-2.19.1.dist-info}/METADATA +1 -1
- {docling_core-2.18.1.dist-info → docling_core-2.19.1.dist-info}/RECORD +9 -9
- {docling_core-2.18.1.dist-info → docling_core-2.19.1.dist-info}/LICENSE +0 -0
- {docling_core-2.18.1.dist-info → docling_core-2.19.1.dist-info}/WHEEL +0 -0
- {docling_core-2.18.1.dist-info → docling_core-2.19.1.dist-info}/entry_points.txt +0 -0
|
@@ -19,6 +19,7 @@ from docling_core.search.package import VERSION_PATTERN
|
|
|
19
19
|
from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
|
|
20
20
|
from docling_core.types import DoclingDocument as DLDocument
|
|
21
21
|
from docling_core.types.doc.document import (
|
|
22
|
+
CodeItem,
|
|
22
23
|
DocItem,
|
|
23
24
|
DocumentOrigin,
|
|
24
25
|
LevelNumber,
|
|
@@ -199,8 +200,10 @@ class HierarchicalChunker(BaseChunker):
|
|
|
199
200
|
heading_by_level.pop(k, None)
|
|
200
201
|
continue
|
|
201
202
|
|
|
202
|
-
if
|
|
203
|
-
|
|
203
|
+
if (
|
|
204
|
+
isinstance(item, TextItem)
|
|
205
|
+
or ((not self.merge_list_items) and isinstance(item, ListItem))
|
|
206
|
+
or isinstance(item, CodeItem)
|
|
204
207
|
):
|
|
205
208
|
text = item.text
|
|
206
209
|
elif isinstance(item, TableItem):
|
|
@@ -75,6 +75,14 @@ DEFAULT_EXPORT_LABELS = {
|
|
|
75
75
|
DocItemLabel.PAGE_FOOTER,
|
|
76
76
|
}
|
|
77
77
|
|
|
78
|
+
DOCUMENT_TOKENS_EXPORT_LABELS = DEFAULT_EXPORT_LABELS.copy()
|
|
79
|
+
DOCUMENT_TOKENS_EXPORT_LABELS.update(
|
|
80
|
+
[
|
|
81
|
+
DocItemLabel.FOOTNOTE,
|
|
82
|
+
DocItemLabel.CAPTION,
|
|
83
|
+
]
|
|
84
|
+
)
|
|
85
|
+
|
|
78
86
|
|
|
79
87
|
class BasePictureData(BaseModel):
|
|
80
88
|
"""BasePictureData."""
|
|
@@ -564,9 +572,8 @@ class DocItem(
|
|
|
564
572
|
self,
|
|
565
573
|
doc: "DoclingDocument",
|
|
566
574
|
new_line: str,
|
|
567
|
-
xsize: int =
|
|
568
|
-
ysize: int =
|
|
569
|
-
add_page_index: bool = True,
|
|
575
|
+
xsize: int = 500,
|
|
576
|
+
ysize: int = 500,
|
|
570
577
|
) -> str:
|
|
571
578
|
"""Get the location string for the BaseCell."""
|
|
572
579
|
if not len(self.prov):
|
|
@@ -576,17 +583,12 @@ class DocItem(
|
|
|
576
583
|
for prov in self.prov:
|
|
577
584
|
page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
|
|
578
585
|
|
|
579
|
-
page_i = -1
|
|
580
|
-
if add_page_index:
|
|
581
|
-
page_i = prov.page_no
|
|
582
|
-
|
|
583
586
|
loc_str = DocumentToken.get_location(
|
|
584
|
-
bbox=prov.bbox.
|
|
587
|
+
bbox=prov.bbox.to_top_left_origin(page_h).as_tuple(),
|
|
585
588
|
page_w=page_w,
|
|
586
589
|
page_h=page_h,
|
|
587
590
|
xsize=xsize,
|
|
588
591
|
ysize=ysize,
|
|
589
|
-
page_i=page_i,
|
|
590
592
|
)
|
|
591
593
|
location += f"{loc_str}{new_line}"
|
|
592
594
|
|
|
@@ -641,57 +643,40 @@ class TextItem(DocItem):
|
|
|
641
643
|
def export_to_document_tokens(
|
|
642
644
|
self,
|
|
643
645
|
doc: "DoclingDocument",
|
|
644
|
-
new_line: str = "
|
|
645
|
-
xsize: int =
|
|
646
|
-
ysize: int =
|
|
646
|
+
new_line: str = "",
|
|
647
|
+
xsize: int = 500,
|
|
648
|
+
ysize: int = 500,
|
|
647
649
|
add_location: bool = True,
|
|
648
650
|
add_content: bool = True,
|
|
649
|
-
add_page_index: bool = True,
|
|
650
651
|
):
|
|
651
652
|
r"""Export text element to document tokens format.
|
|
652
653
|
|
|
653
654
|
:param doc: "DoclingDocument":
|
|
654
|
-
:param new_line: str
|
|
655
|
-
:param xsize: int: (Default value =
|
|
656
|
-
:param ysize: int: (Default value =
|
|
655
|
+
:param new_line: str (Default value = "")
|
|
656
|
+
:param xsize: int: (Default value = 500)
|
|
657
|
+
:param ysize: int: (Default value = 500)
|
|
657
658
|
:param add_location: bool: (Default value = True)
|
|
658
659
|
:param add_content: bool: (Default value = True)
|
|
659
|
-
:param add_page_index: bool: (Default value = True)
|
|
660
660
|
|
|
661
661
|
"""
|
|
662
|
-
body = f"<{self.label.value}>"
|
|
663
|
-
|
|
664
|
-
# TODO: This must be done through an explicit mapping.
|
|
665
|
-
# assert DocumentToken.is_known_token(
|
|
666
|
-
# body
|
|
667
|
-
# ), f"failed DocumentToken.is_known_token({body})"
|
|
662
|
+
body = f"<{self.label.value}>{new_line}"
|
|
668
663
|
|
|
669
664
|
if add_location:
|
|
670
665
|
body += self.get_location_tokens(
|
|
671
666
|
doc=doc,
|
|
672
|
-
new_line=
|
|
667
|
+
new_line=new_line,
|
|
673
668
|
xsize=xsize,
|
|
674
669
|
ysize=ysize,
|
|
675
|
-
add_page_index=add_page_index,
|
|
676
670
|
)
|
|
677
671
|
|
|
678
672
|
if add_content and self.text is not None:
|
|
679
|
-
body += self.text.strip()
|
|
673
|
+
body += f"{self.text.strip()}{new_line}"
|
|
680
674
|
|
|
681
|
-
body += f"</{self.label.value}
|
|
675
|
+
body += f"</{self.label.value}>\n"
|
|
682
676
|
|
|
683
677
|
return body
|
|
684
678
|
|
|
685
679
|
|
|
686
|
-
class CodeItem(TextItem):
|
|
687
|
-
"""CodeItem."""
|
|
688
|
-
|
|
689
|
-
label: typing.Literal[DocItemLabel.CODE] = (
|
|
690
|
-
DocItemLabel.CODE # type: ignore[assignment]
|
|
691
|
-
)
|
|
692
|
-
code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN
|
|
693
|
-
|
|
694
|
-
|
|
695
680
|
class SectionHeaderItem(TextItem):
|
|
696
681
|
"""SectionItem."""
|
|
697
682
|
|
|
@@ -703,25 +688,23 @@ class SectionHeaderItem(TextItem):
|
|
|
703
688
|
def export_to_document_tokens(
|
|
704
689
|
self,
|
|
705
690
|
doc: "DoclingDocument",
|
|
706
|
-
new_line: str = "
|
|
707
|
-
xsize: int =
|
|
708
|
-
ysize: int =
|
|
691
|
+
new_line: str = "",
|
|
692
|
+
xsize: int = 500,
|
|
693
|
+
ysize: int = 500,
|
|
709
694
|
add_location: bool = True,
|
|
710
695
|
add_content: bool = True,
|
|
711
|
-
add_page_index: bool = True,
|
|
712
696
|
):
|
|
713
697
|
r"""Export text element to document tokens format.
|
|
714
698
|
|
|
715
699
|
:param doc: "DoclingDocument":
|
|
716
|
-
:param new_line: str
|
|
717
|
-
:param xsize: int: (Default value =
|
|
718
|
-
:param ysize: int: (Default value =
|
|
700
|
+
:param new_line: str (Default value = "")
|
|
701
|
+
:param xsize: int: (Default value = 500)
|
|
702
|
+
:param ysize: int: (Default value = 500)
|
|
719
703
|
:param add_location: bool: (Default value = True)
|
|
720
704
|
:param add_content: bool: (Default value = True)
|
|
721
|
-
:param add_page_index: bool: (Default value = True)
|
|
722
705
|
|
|
723
706
|
"""
|
|
724
|
-
body = f"<{self.label.value}_level_{self.level}>"
|
|
707
|
+
body = f"<{self.label.value}_level_{self.level}>{new_line}"
|
|
725
708
|
|
|
726
709
|
# TODO: This must be done through an explicit mapping.
|
|
727
710
|
# assert DocumentToken.is_known_token(
|
|
@@ -731,16 +714,15 @@ class SectionHeaderItem(TextItem):
|
|
|
731
714
|
if add_location:
|
|
732
715
|
body += self.get_location_tokens(
|
|
733
716
|
doc=doc,
|
|
734
|
-
new_line=
|
|
717
|
+
new_line=new_line,
|
|
735
718
|
xsize=xsize,
|
|
736
719
|
ysize=ysize,
|
|
737
|
-
add_page_index=add_page_index,
|
|
738
720
|
)
|
|
739
721
|
|
|
740
722
|
if add_content and self.text is not None:
|
|
741
|
-
body += self.text.strip()
|
|
723
|
+
body += f"{self.text.strip()}{new_line}"
|
|
742
724
|
|
|
743
|
-
body += f"</{self.label.value}_level_{self.level}
|
|
725
|
+
body += f"</{self.label.value}_level_{self.level}>\n"
|
|
744
726
|
|
|
745
727
|
return body
|
|
746
728
|
|
|
@@ -785,6 +767,51 @@ class FloatingItem(DocItem):
|
|
|
785
767
|
return super().get_image(doc=doc)
|
|
786
768
|
|
|
787
769
|
|
|
770
|
+
class CodeItem(FloatingItem, TextItem):
|
|
771
|
+
"""CodeItem."""
|
|
772
|
+
|
|
773
|
+
label: typing.Literal[DocItemLabel.CODE] = (
|
|
774
|
+
DocItemLabel.CODE # type: ignore[assignment]
|
|
775
|
+
)
|
|
776
|
+
code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN
|
|
777
|
+
|
|
778
|
+
def export_to_document_tokens(
|
|
779
|
+
self,
|
|
780
|
+
doc: "DoclingDocument",
|
|
781
|
+
new_line: str = "",
|
|
782
|
+
xsize: int = 500,
|
|
783
|
+
ysize: int = 500,
|
|
784
|
+
add_location: bool = True,
|
|
785
|
+
add_content: bool = True,
|
|
786
|
+
):
|
|
787
|
+
r"""Export text element to document tokens format.
|
|
788
|
+
|
|
789
|
+
:param doc: "DoclingDocument":
|
|
790
|
+
:param new_line: str (Default value = "")
|
|
791
|
+
:param xsize: int: (Default value = 500)
|
|
792
|
+
:param ysize: int: (Default value = 500)
|
|
793
|
+
:param add_location: bool: (Default value = True)
|
|
794
|
+
:param add_content: bool: (Default value = True)
|
|
795
|
+
|
|
796
|
+
"""
|
|
797
|
+
body = f"<{self.label.value}{new_line}"
|
|
798
|
+
|
|
799
|
+
if add_location:
|
|
800
|
+
body += self.get_location_tokens(
|
|
801
|
+
doc=doc,
|
|
802
|
+
new_line=new_line,
|
|
803
|
+
xsize=xsize,
|
|
804
|
+
ysize=ysize,
|
|
805
|
+
)
|
|
806
|
+
|
|
807
|
+
if add_content and self.text is not None:
|
|
808
|
+
body += f"<_{self.code_language.value}_>{self.text}{new_line}"
|
|
809
|
+
|
|
810
|
+
body += f"</{self.label.value}\n"
|
|
811
|
+
|
|
812
|
+
return body
|
|
813
|
+
|
|
814
|
+
|
|
788
815
|
class PictureItem(FloatingItem):
|
|
789
816
|
"""PictureItem."""
|
|
790
817
|
|
|
@@ -931,47 +958,62 @@ class PictureItem(FloatingItem):
|
|
|
931
958
|
def export_to_document_tokens(
|
|
932
959
|
self,
|
|
933
960
|
doc: "DoclingDocument",
|
|
934
|
-
new_line: str = "
|
|
935
|
-
xsize: int =
|
|
936
|
-
ysize: int =
|
|
961
|
+
new_line: str = "",
|
|
962
|
+
xsize: int = 500,
|
|
963
|
+
ysize: int = 500,
|
|
937
964
|
add_location: bool = True,
|
|
938
965
|
add_caption: bool = True,
|
|
939
966
|
add_content: bool = True, # not used at the moment
|
|
940
|
-
add_page_index: bool = True,
|
|
941
967
|
):
|
|
942
968
|
r"""Export picture to document tokens format.
|
|
943
969
|
|
|
944
970
|
:param doc: "DoclingDocument":
|
|
945
|
-
:param new_line: str
|
|
946
|
-
:param xsize: int: (Default value =
|
|
947
|
-
:param ysize: int: (Default value =
|
|
971
|
+
:param new_line: str (Default value = "")
|
|
972
|
+
:param xsize: int: (Default value = 500)
|
|
973
|
+
:param ysize: int: (Default value = 500)
|
|
948
974
|
:param add_location: bool: (Default value = True)
|
|
949
975
|
:param add_caption: bool: (Default value = True)
|
|
950
976
|
:param add_content: bool: (Default value = True)
|
|
951
|
-
:param # not used at the
|
|
977
|
+
:param # not used at the moment
|
|
952
978
|
|
|
953
979
|
"""
|
|
954
|
-
body = f"{
|
|
955
|
-
|
|
980
|
+
body = f"<{self.label.value}>{new_line}"
|
|
956
981
|
if add_location:
|
|
957
982
|
body += self.get_location_tokens(
|
|
958
983
|
doc=doc,
|
|
959
984
|
new_line=new_line,
|
|
960
985
|
xsize=xsize,
|
|
961
986
|
ysize=ysize,
|
|
962
|
-
add_page_index=add_page_index,
|
|
963
987
|
)
|
|
964
988
|
|
|
989
|
+
classifications = [
|
|
990
|
+
ann
|
|
991
|
+
for ann in self.annotations
|
|
992
|
+
if isinstance(ann, PictureClassificationData)
|
|
993
|
+
]
|
|
994
|
+
if len(classifications) > 0:
|
|
995
|
+
# ! TODO: currently this code assumes class_name is of type 'str'
|
|
996
|
+
# ! TODO: when it will change to an ENUM --> adapt code
|
|
997
|
+
predicted_class = classifications[0].predicted_classes[0].class_name
|
|
998
|
+
body += DocumentToken.get_picture_classification_token(predicted_class)
|
|
999
|
+
|
|
965
1000
|
if add_caption and len(self.captions):
|
|
966
1001
|
text = self.caption_text(doc)
|
|
967
1002
|
|
|
968
1003
|
if len(text):
|
|
969
|
-
body += f"{
|
|
1004
|
+
body += f"<{DocItemLabel.CAPTION.value}>"
|
|
1005
|
+
for caption in self.captions:
|
|
1006
|
+
body += caption.resolve(doc).get_location_tokens(
|
|
1007
|
+
doc=doc,
|
|
1008
|
+
new_line=new_line,
|
|
1009
|
+
xsize=xsize,
|
|
1010
|
+
ysize=ysize,
|
|
1011
|
+
)
|
|
970
1012
|
body += f"{text.strip()}"
|
|
971
|
-
body += f"{
|
|
1013
|
+
body += f"</{DocItemLabel.CAPTION.value}>"
|
|
972
1014
|
body += f"{new_line}"
|
|
973
1015
|
|
|
974
|
-
body += f"{
|
|
1016
|
+
body += f"</{self.label.value}>\n"
|
|
975
1017
|
|
|
976
1018
|
return body
|
|
977
1019
|
|
|
@@ -1143,8 +1185,8 @@ class TableItem(FloatingItem):
|
|
|
1143
1185
|
doc: "DoclingDocument",
|
|
1144
1186
|
add_cell_location: bool = True,
|
|
1145
1187
|
add_cell_text: bool = True,
|
|
1146
|
-
xsize: int =
|
|
1147
|
-
ysize: int =
|
|
1188
|
+
xsize: int = 500,
|
|
1189
|
+
ysize: int = 500,
|
|
1148
1190
|
) -> str:
|
|
1149
1191
|
"""Export the table as OTSL."""
|
|
1150
1192
|
# Possible OTSL tokens...
|
|
@@ -1194,7 +1236,6 @@ class TableItem(FloatingItem):
|
|
|
1194
1236
|
page_h=page_h,
|
|
1195
1237
|
xsize=xsize,
|
|
1196
1238
|
ysize=ysize,
|
|
1197
|
-
page_i=page_no,
|
|
1198
1239
|
)
|
|
1199
1240
|
|
|
1200
1241
|
if rowstart == i and colstart == j:
|
|
@@ -1234,33 +1275,29 @@ class TableItem(FloatingItem):
|
|
|
1234
1275
|
def export_to_document_tokens(
|
|
1235
1276
|
self,
|
|
1236
1277
|
doc: "DoclingDocument",
|
|
1237
|
-
new_line: str = "
|
|
1238
|
-
xsize: int =
|
|
1239
|
-
ysize: int =
|
|
1278
|
+
new_line: str = "",
|
|
1279
|
+
xsize: int = 500,
|
|
1280
|
+
ysize: int = 500,
|
|
1240
1281
|
add_location: bool = True,
|
|
1241
|
-
add_caption: bool = True,
|
|
1242
|
-
add_content: bool = True,
|
|
1243
1282
|
add_cell_location: bool = True,
|
|
1244
|
-
add_cell_label: bool = True,
|
|
1245
1283
|
add_cell_text: bool = True,
|
|
1246
|
-
|
|
1284
|
+
add_caption: bool = True,
|
|
1247
1285
|
):
|
|
1248
1286
|
r"""Export table to document tokens format.
|
|
1249
1287
|
|
|
1250
1288
|
:param doc: "DoclingDocument":
|
|
1251
|
-
:param new_line: str
|
|
1252
|
-
:param xsize: int: (Default value =
|
|
1253
|
-
:param ysize: int: (Default value =
|
|
1289
|
+
:param new_line: str (Default value = "")
|
|
1290
|
+
:param xsize: int: (Default value = 500)
|
|
1291
|
+
:param ysize: int: (Default value = 500)
|
|
1254
1292
|
:param add_location: bool: (Default value = True)
|
|
1255
|
-
:param add_caption: bool: (Default value = True)
|
|
1256
|
-
:param add_content: bool: (Default value = True)
|
|
1257
1293
|
:param add_cell_location: bool: (Default value = True)
|
|
1258
|
-
:param add_cell_label: bool: (Default value = True)
|
|
1259
1294
|
:param add_cell_text: bool: (Default value = True)
|
|
1260
|
-
:param
|
|
1295
|
+
:param add_caption: bool: (Default value = True)
|
|
1261
1296
|
|
|
1262
1297
|
"""
|
|
1263
|
-
|
|
1298
|
+
otsl_tag = DocumentToken.OTSL.value
|
|
1299
|
+
|
|
1300
|
+
body = f"<{otsl_tag}>{new_line}"
|
|
1264
1301
|
|
|
1265
1302
|
if add_location:
|
|
1266
1303
|
body += self.get_location_tokens(
|
|
@@ -1268,76 +1305,27 @@ class TableItem(FloatingItem):
|
|
|
1268
1305
|
new_line=new_line,
|
|
1269
1306
|
xsize=xsize,
|
|
1270
1307
|
ysize=ysize,
|
|
1271
|
-
add_page_index=add_page_index,
|
|
1272
1308
|
)
|
|
1273
1309
|
|
|
1310
|
+
body += self.export_to_otsl(doc, add_cell_location, add_cell_text, xsize, ysize)
|
|
1311
|
+
|
|
1274
1312
|
if add_caption and len(self.captions):
|
|
1275
1313
|
text = self.caption_text(doc)
|
|
1276
1314
|
|
|
1277
1315
|
if len(text):
|
|
1278
|
-
body += f"{
|
|
1316
|
+
body += f"<{DocItemLabel.CAPTION.value}>"
|
|
1317
|
+
for caption in self.captions:
|
|
1318
|
+
body += caption.resolve(doc).get_location_tokens(
|
|
1319
|
+
doc=doc,
|
|
1320
|
+
new_line=new_line,
|
|
1321
|
+
xsize=xsize,
|
|
1322
|
+
ysize=ysize,
|
|
1323
|
+
)
|
|
1279
1324
|
body += f"{text.strip()}"
|
|
1280
|
-
body += f"{
|
|
1325
|
+
body += f"</{DocItemLabel.CAPTION.value}>"
|
|
1281
1326
|
body += f"{new_line}"
|
|
1282
1327
|
|
|
1283
|
-
|
|
1284
|
-
for i, row in enumerate(self.data.grid):
|
|
1285
|
-
body += f"<row_{i}>"
|
|
1286
|
-
for j, col in enumerate(row):
|
|
1287
|
-
|
|
1288
|
-
text = ""
|
|
1289
|
-
if add_cell_text:
|
|
1290
|
-
text = col.text.strip()
|
|
1291
|
-
|
|
1292
|
-
cell_loc = ""
|
|
1293
|
-
if (
|
|
1294
|
-
col.bbox is not None
|
|
1295
|
-
and add_cell_location
|
|
1296
|
-
and add_page_index
|
|
1297
|
-
and len(self.prov) > 0
|
|
1298
|
-
):
|
|
1299
|
-
page_w, page_h = doc.pages[self.prov[0].page_no].size.as_tuple()
|
|
1300
|
-
cell_loc = DocumentToken.get_location(
|
|
1301
|
-
bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
|
|
1302
|
-
page_w=page_w,
|
|
1303
|
-
page_h=page_h,
|
|
1304
|
-
xsize=xsize,
|
|
1305
|
-
ysize=ysize,
|
|
1306
|
-
page_i=self.prov[0].page_no,
|
|
1307
|
-
)
|
|
1308
|
-
elif (
|
|
1309
|
-
col.bbox is not None
|
|
1310
|
-
and add_cell_location
|
|
1311
|
-
and not add_page_index
|
|
1312
|
-
and len(self.prov) > 0
|
|
1313
|
-
):
|
|
1314
|
-
page_w, page_h = doc.pages[self.prov[0].page_no].size.as_tuple()
|
|
1315
|
-
|
|
1316
|
-
cell_loc = DocumentToken.get_location(
|
|
1317
|
-
bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
|
|
1318
|
-
page_w=page_w,
|
|
1319
|
-
page_h=page_h,
|
|
1320
|
-
xsize=xsize,
|
|
1321
|
-
ysize=ysize,
|
|
1322
|
-
page_i=-1,
|
|
1323
|
-
)
|
|
1324
|
-
|
|
1325
|
-
cell_label = ""
|
|
1326
|
-
if add_cell_label:
|
|
1327
|
-
if col.column_header:
|
|
1328
|
-
cell_label = "<col_header>"
|
|
1329
|
-
elif col.row_header:
|
|
1330
|
-
cell_label = "<row_header>"
|
|
1331
|
-
elif col.row_section:
|
|
1332
|
-
cell_label = "<row_section>"
|
|
1333
|
-
else:
|
|
1334
|
-
cell_label = "<body>"
|
|
1335
|
-
|
|
1336
|
-
body += f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>"
|
|
1337
|
-
|
|
1338
|
-
body += f"</row_{i}>{new_line}"
|
|
1339
|
-
|
|
1340
|
-
body += f"{DocumentToken.END_TABLE.value}{new_line}"
|
|
1328
|
+
body += f"</{otsl_tag}>\n"
|
|
1341
1329
|
|
|
1342
1330
|
return body
|
|
1343
1331
|
|
|
@@ -1777,6 +1765,7 @@ class DoclingDocument(BaseModel):
|
|
|
1777
1765
|
text: str,
|
|
1778
1766
|
code_language: Optional[CodeLanguageLabel] = None,
|
|
1779
1767
|
orig: Optional[str] = None,
|
|
1768
|
+
caption: Optional[Union[TextItem, RefItem]] = None,
|
|
1780
1769
|
prov: Optional[ProvenanceItem] = None,
|
|
1781
1770
|
parent: Optional[NodeItem] = None,
|
|
1782
1771
|
content_layer: Optional[ContentLayer] = None,
|
|
@@ -1786,6 +1775,8 @@ class DoclingDocument(BaseModel):
|
|
|
1786
1775
|
:param text: str:
|
|
1787
1776
|
:param code_language: Optional[str]: (Default value = None)
|
|
1788
1777
|
:param orig: Optional[str]: (Default value = None)
|
|
1778
|
+
:param caption: Optional[Union[TextItem:
|
|
1779
|
+
:param RefItem]]: (Default value = None)
|
|
1789
1780
|
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1790
1781
|
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1791
1782
|
"""
|
|
@@ -1809,6 +1800,8 @@ class DoclingDocument(BaseModel):
|
|
|
1809
1800
|
code_item.content_layer = content_layer
|
|
1810
1801
|
if prov:
|
|
1811
1802
|
code_item.prov.append(prov)
|
|
1803
|
+
if caption:
|
|
1804
|
+
code_item.captions.append(caption.get_ref())
|
|
1812
1805
|
|
|
1813
1806
|
self.texts.append(code_item)
|
|
1814
1807
|
parent.children.append(RefItem(cref=cref))
|
|
@@ -1927,6 +1920,7 @@ class DoclingDocument(BaseModel):
|
|
|
1927
1920
|
traverse_pictures=traverse_pictures,
|
|
1928
1921
|
page_no=page_no,
|
|
1929
1922
|
_level=_level + 1,
|
|
1923
|
+
included_content_layers=included_content_layers,
|
|
1930
1924
|
)
|
|
1931
1925
|
|
|
1932
1926
|
def _clear_picture_pil_cache(self):
|
|
@@ -2132,6 +2126,7 @@ class DoclingDocument(BaseModel):
|
|
|
2132
2126
|
indent: int = 4,
|
|
2133
2127
|
text_width: int = -1,
|
|
2134
2128
|
page_no: Optional[int] = None,
|
|
2129
|
+
included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
|
|
2135
2130
|
):
|
|
2136
2131
|
"""Save to markdown."""
|
|
2137
2132
|
artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
|
|
@@ -2155,6 +2150,7 @@ class DoclingDocument(BaseModel):
|
|
|
2155
2150
|
indent=indent,
|
|
2156
2151
|
text_width=text_width,
|
|
2157
2152
|
page_no=page_no,
|
|
2153
|
+
included_content_layers=included_content_layers,
|
|
2158
2154
|
)
|
|
2159
2155
|
|
|
2160
2156
|
with open(filename, "w", encoding="utf-8") as fw:
|
|
@@ -2173,6 +2169,7 @@ class DoclingDocument(BaseModel):
|
|
|
2173
2169
|
indent: int = 4,
|
|
2174
2170
|
text_width: int = -1,
|
|
2175
2171
|
page_no: Optional[int] = None,
|
|
2172
|
+
included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
|
|
2176
2173
|
) -> str:
|
|
2177
2174
|
r"""Serialize to Markdown.
|
|
2178
2175
|
|
|
@@ -2254,7 +2251,12 @@ class DoclingDocument(BaseModel):
|
|
|
2254
2251
|
mdtexts.append(text)
|
|
2255
2252
|
|
|
2256
2253
|
for ix, (item, level) in enumerate(
|
|
2257
|
-
self.iterate_items(
|
|
2254
|
+
self.iterate_items(
|
|
2255
|
+
self.body,
|
|
2256
|
+
with_groups=True,
|
|
2257
|
+
page_no=page_no,
|
|
2258
|
+
included_content_layers=included_content_layers,
|
|
2259
|
+
)
|
|
2258
2260
|
):
|
|
2259
2261
|
# If we've moved to a lower level, we're exiting one or more groups
|
|
2260
2262
|
if level < previous_level:
|
|
@@ -2423,6 +2425,7 @@ class DoclingDocument(BaseModel):
|
|
|
2423
2425
|
page_no: Optional[int] = None,
|
|
2424
2426
|
html_lang: str = "en",
|
|
2425
2427
|
html_head: str = _HTML_DEFAULT_HEAD,
|
|
2428
|
+
included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
|
|
2426
2429
|
):
|
|
2427
2430
|
"""Save to HTML."""
|
|
2428
2431
|
artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
|
|
@@ -2443,6 +2446,7 @@ class DoclingDocument(BaseModel):
|
|
|
2443
2446
|
page_no=page_no,
|
|
2444
2447
|
html_lang=html_lang,
|
|
2445
2448
|
html_head=html_head,
|
|
2449
|
+
included_content_layers=included_content_layers,
|
|
2446
2450
|
)
|
|
2447
2451
|
|
|
2448
2452
|
with open(filename, "w", encoding="utf-8") as fw:
|
|
@@ -2490,6 +2494,7 @@ class DoclingDocument(BaseModel):
|
|
|
2490
2494
|
page_no: Optional[int] = None,
|
|
2491
2495
|
html_lang: str = "en",
|
|
2492
2496
|
html_head: str = _HTML_DEFAULT_HEAD,
|
|
2497
|
+
included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
|
|
2493
2498
|
) -> str:
|
|
2494
2499
|
r"""Serialize to HTML."""
|
|
2495
2500
|
|
|
@@ -2531,7 +2536,12 @@ class DoclingDocument(BaseModel):
|
|
|
2531
2536
|
return text
|
|
2532
2537
|
|
|
2533
2538
|
for ix, (item, curr_level) in enumerate(
|
|
2534
|
-
self.iterate_items(
|
|
2539
|
+
self.iterate_items(
|
|
2540
|
+
self.body,
|
|
2541
|
+
with_groups=True,
|
|
2542
|
+
page_no=page_no,
|
|
2543
|
+
included_content_layers=included_content_layers,
|
|
2544
|
+
)
|
|
2535
2545
|
):
|
|
2536
2546
|
# If we've moved to a lower level, we're exiting one or more groups
|
|
2537
2547
|
if curr_level < prev_level and len(in_ordered_list) > 0:
|
|
@@ -2708,22 +2718,18 @@ class DoclingDocument(BaseModel):
|
|
|
2708
2718
|
def save_as_document_tokens(
|
|
2709
2719
|
self,
|
|
2710
2720
|
filename: Path,
|
|
2711
|
-
delim: str = "
|
|
2721
|
+
delim: str = "",
|
|
2712
2722
|
from_element: int = 0,
|
|
2713
2723
|
to_element: int = sys.maxsize,
|
|
2714
|
-
labels: set[DocItemLabel] =
|
|
2715
|
-
xsize: int =
|
|
2716
|
-
ysize: int =
|
|
2724
|
+
labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
|
|
2725
|
+
xsize: int = 500,
|
|
2726
|
+
ysize: int = 500,
|
|
2717
2727
|
add_location: bool = True,
|
|
2718
2728
|
add_content: bool = True,
|
|
2719
2729
|
add_page_index: bool = True,
|
|
2720
2730
|
# table specific flags
|
|
2721
2731
|
add_table_cell_location: bool = False,
|
|
2722
|
-
add_table_cell_label: bool = True,
|
|
2723
2732
|
add_table_cell_text: bool = True,
|
|
2724
|
-
# specifics
|
|
2725
|
-
page_no: Optional[int] = None,
|
|
2726
|
-
with_groups: bool = True,
|
|
2727
2733
|
):
|
|
2728
2734
|
r"""Save the document content to a DocumentToken format."""
|
|
2729
2735
|
out = self.export_to_document_tokens(
|
|
@@ -2738,198 +2744,230 @@ class DoclingDocument(BaseModel):
|
|
|
2738
2744
|
add_page_index=add_page_index,
|
|
2739
2745
|
# table specific flags
|
|
2740
2746
|
add_table_cell_location=add_table_cell_location,
|
|
2741
|
-
add_table_cell_label=add_table_cell_label,
|
|
2742
2747
|
add_table_cell_text=add_table_cell_text,
|
|
2743
|
-
# specifics
|
|
2744
|
-
page_no=page_no,
|
|
2745
|
-
with_groups=with_groups,
|
|
2746
2748
|
)
|
|
2747
2749
|
|
|
2748
2750
|
with open(filename, "w", encoding="utf-8") as fw:
|
|
2749
2751
|
fw.write(out)
|
|
2750
2752
|
|
|
2751
|
-
def export_to_document_tokens(
|
|
2753
|
+
def export_to_document_tokens( # noqa: C901
|
|
2752
2754
|
self,
|
|
2753
|
-
delim: str = "
|
|
2755
|
+
delim: str = "",
|
|
2754
2756
|
from_element: int = 0,
|
|
2755
2757
|
to_element: int = sys.maxsize,
|
|
2756
|
-
labels: set[DocItemLabel] =
|
|
2757
|
-
xsize: int =
|
|
2758
|
-
ysize: int =
|
|
2758
|
+
labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
|
|
2759
|
+
xsize: int = 500,
|
|
2760
|
+
ysize: int = 500,
|
|
2759
2761
|
add_location: bool = True,
|
|
2760
2762
|
add_content: bool = True,
|
|
2761
2763
|
add_page_index: bool = True,
|
|
2762
2764
|
# table specific flags
|
|
2763
2765
|
add_table_cell_location: bool = False,
|
|
2764
|
-
add_table_cell_label: bool = True,
|
|
2765
2766
|
add_table_cell_text: bool = True,
|
|
2766
|
-
# specifics
|
|
2767
|
-
page_no: Optional[int] = None,
|
|
2768
|
-
with_groups: bool = True,
|
|
2769
|
-
newline: bool = True,
|
|
2770
2767
|
) -> str:
|
|
2771
2768
|
r"""Exports the document content to a DocumentToken format.
|
|
2772
2769
|
|
|
2773
2770
|
Operates on a slice of the document's body as defined through arguments
|
|
2774
2771
|
from_element and to_element; defaulting to the whole main_text.
|
|
2775
2772
|
|
|
2776
|
-
:param delim: str: (Default value = "
|
|
2773
|
+
:param delim: str: (Default value = "")
|
|
2777
2774
|
:param from_element: int: (Default value = 0)
|
|
2778
2775
|
:param to_element: Optional[int]: (Default value = None)
|
|
2779
2776
|
:param labels: set[DocItemLabel]
|
|
2780
|
-
:param xsize: int: (Default value =
|
|
2781
|
-
:param ysize: int: (Default value =
|
|
2777
|
+
:param xsize: int: (Default value = 500)
|
|
2778
|
+
:param ysize: int: (Default value = 500)
|
|
2782
2779
|
:param add_location: bool: (Default value = True)
|
|
2783
2780
|
:param add_content: bool: (Default value = True)
|
|
2784
2781
|
:param add_page_index: bool: (Default value = True)
|
|
2785
2782
|
:param # table specific flagsadd_table_cell_location: bool
|
|
2786
|
-
:param add_table_cell_label: bool: (Default value = True)
|
|
2787
2783
|
:param add_table_cell_text: bool: (Default value = True)
|
|
2788
2784
|
:returns: The content of the document formatted as a DocTags string.
|
|
2789
2785
|
:rtype: str
|
|
2790
2786
|
"""
|
|
2791
2787
|
|
|
2792
|
-
def
|
|
2793
|
-
|
|
2794
|
-
|
|
2795
|
-
|
|
2796
|
-
|
|
2797
|
-
|
|
2798
|
-
|
|
2799
|
-
|
|
2800
|
-
|
|
2801
|
-
|
|
2802
|
-
|
|
2803
|
-
while curr_level < prev_level and len(in_ordered_list) > 0:
|
|
2804
|
-
if in_ordered_list[-1]:
|
|
2805
|
-
result += f"</ordered_list>{delim}"
|
|
2788
|
+
def _close_lists(
|
|
2789
|
+
current_level: int,
|
|
2790
|
+
previous_level: int,
|
|
2791
|
+
ordered_list_stack: List[bool],
|
|
2792
|
+
output_parts: List[str],
|
|
2793
|
+
) -> List[bool]:
|
|
2794
|
+
"""Close open list tags until the nesting level matches item's level."""
|
|
2795
|
+
while current_level < previous_level and ordered_list_stack:
|
|
2796
|
+
last_is_ordered = ordered_list_stack.pop()
|
|
2797
|
+
if last_is_ordered:
|
|
2798
|
+
output_parts.append(f"</{DocumentToken.ORDERED_LIST.value}>\n")
|
|
2806
2799
|
else:
|
|
2807
|
-
|
|
2808
|
-
|
|
2809
|
-
|
|
2810
|
-
|
|
2811
|
-
|
|
2812
|
-
|
|
2813
|
-
|
|
2814
|
-
|
|
2815
|
-
|
|
2816
|
-
else:
|
|
2817
|
-
delim = ""
|
|
2818
|
-
|
|
2819
|
-
prev_level = 0 # Track the previous item's level
|
|
2820
|
-
|
|
2821
|
-
in_ordered_list: List[bool] = [] # False
|
|
2822
|
-
|
|
2823
|
-
result = f"{DocumentToken.BEG_DOCUMENT.value}{delim}"
|
|
2824
|
-
|
|
2825
|
-
for ix, (item, curr_level) in enumerate(
|
|
2826
|
-
self.iterate_items(self.body, with_groups=True)
|
|
2800
|
+
output_parts.append(f"</{DocumentToken.UNORDERED_LIST.value}>\n")
|
|
2801
|
+
previous_level -= 1
|
|
2802
|
+
return ordered_list_stack
|
|
2803
|
+
|
|
2804
|
+
def _add_page_break_if_needed(
|
|
2805
|
+
output_parts: List[str],
|
|
2806
|
+
item,
|
|
2807
|
+
prev_page_no,
|
|
2808
|
+
page_break_enabled: bool,
|
|
2827
2809
|
):
|
|
2828
|
-
|
|
2829
|
-
|
|
2830
|
-
if
|
|
2831
|
-
|
|
2832
|
-
|
|
2833
|
-
|
|
2834
|
-
|
|
2835
|
-
|
|
2836
|
-
|
|
2837
|
-
|
|
2838
|
-
|
|
2839
|
-
|
|
2840
|
-
|
|
2841
|
-
|
|
2810
|
+
"""Inserts a page-break token.
|
|
2811
|
+
|
|
2812
|
+
Inserts a page-break token if the item's page number is different
|
|
2813
|
+
from the previous item and page breaks are enabled.
|
|
2814
|
+
Returns the updated output_parts list and the current page number.
|
|
2815
|
+
"""
|
|
2816
|
+
if not page_break_enabled:
|
|
2817
|
+
return output_parts, prev_page_no
|
|
2818
|
+
|
|
2819
|
+
if not item.prov:
|
|
2820
|
+
return output_parts, prev_page_no
|
|
2821
|
+
|
|
2822
|
+
current_page_no = item.prov[0].page_no
|
|
2823
|
+
if prev_page_no is None:
|
|
2824
|
+
return output_parts, current_page_no
|
|
2825
|
+
|
|
2826
|
+
if current_page_no != prev_page_no:
|
|
2827
|
+
output_parts.append(f"<{DocumentToken.PAGE_BREAK.value}>\n")
|
|
2828
|
+
|
|
2829
|
+
return output_parts, current_page_no
|
|
2830
|
+
|
|
2831
|
+
def _get_standalone_captions(document_body):
|
|
2832
|
+
"""Identify captions that are not attached to any table or figure."""
|
|
2833
|
+
all_captions = set()
|
|
2834
|
+
matched_captions = set()
|
|
2835
|
+
for item, _ in self.iterate_items(document_body, with_groups=True):
|
|
2836
|
+
if item.label == DocItemLabel.CAPTION:
|
|
2837
|
+
all_captions.update([item.self_ref])
|
|
2838
|
+
if item.label in [DocItemLabel.PICTURE, DocItemLabel.TABLE]:
|
|
2839
|
+
matched_captions.update([caption.cref for caption in item.captions])
|
|
2840
|
+
|
|
2841
|
+
return all_captions - matched_captions
|
|
2842
|
+
|
|
2843
|
+
# Initialization
|
|
2844
|
+
output_parts: List[str] = []
|
|
2845
|
+
ordered_list_stack: List[bool] = []
|
|
2846
|
+
previous_level = 0
|
|
2847
|
+
previous_page_no = None
|
|
2848
|
+
|
|
2849
|
+
# Precompute standalone captions
|
|
2850
|
+
standalone_captions = _get_standalone_captions(self.body)
|
|
2851
|
+
|
|
2852
|
+
# Begin document
|
|
2853
|
+
output_parts.append(f"<{DocumentToken.DOCUMENT.value}>{delim}")
|
|
2854
|
+
|
|
2855
|
+
for ix, (item, current_level) in enumerate(
|
|
2856
|
+
self.iterate_items(
|
|
2857
|
+
self.body,
|
|
2858
|
+
with_groups=True,
|
|
2859
|
+
included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE},
|
|
2860
|
+
)
|
|
2861
|
+
):
|
|
2862
|
+
# Close lists if we've moved to a lower nesting level
|
|
2863
|
+
if current_level < previous_level and ordered_list_stack:
|
|
2864
|
+
ordered_list_stack = _close_lists(
|
|
2865
|
+
current_level, previous_level, ordered_list_stack, output_parts
|
|
2842
2866
|
)
|
|
2867
|
+
previous_level = current_level
|
|
2843
2868
|
|
|
2844
|
-
|
|
2845
|
-
|
|
2846
|
-
|
|
2847
|
-
continue # skip as many items as you want
|
|
2848
|
-
|
|
2849
|
-
if (isinstance(item, DocItem)) and (item.label not in labels):
|
|
2850
|
-
continue # skip any label that is not whitelisted
|
|
2851
|
-
|
|
2852
|
-
if isinstance(item, GroupItem) and item.label in [
|
|
2853
|
-
GroupLabel.ORDERED_LIST,
|
|
2854
|
-
]:
|
|
2869
|
+
# Skip items outside the specified element range
|
|
2870
|
+
if ix < from_element or ix >= to_element:
|
|
2871
|
+
continue
|
|
2855
2872
|
|
|
2856
|
-
|
|
2857
|
-
|
|
2873
|
+
# Skip items whose label is not in the allowed set
|
|
2874
|
+
if isinstance(item, DocItem) and (item.label not in labels):
|
|
2875
|
+
continue
|
|
2858
2876
|
|
|
2859
|
-
|
|
2860
|
-
|
|
2861
|
-
|
|
2877
|
+
# Skip captions that are not standalone as they will be included below
|
|
2878
|
+
# by the export functions of Table and Picture
|
|
2879
|
+
if (
|
|
2880
|
+
isinstance(item, TextItem)
|
|
2881
|
+
and item.label == DocItemLabel.CAPTION
|
|
2882
|
+
and item.self_ref not in standalone_captions
|
|
2883
|
+
):
|
|
2884
|
+
continue
|
|
2862
2885
|
|
|
2863
|
-
|
|
2864
|
-
|
|
2886
|
+
# Handle list groups
|
|
2887
|
+
if isinstance(item, GroupItem):
|
|
2888
|
+
if item.label == GroupLabel.ORDERED_LIST:
|
|
2889
|
+
output_parts.append(f"<{DocumentToken.ORDERED_LIST.value}>{delim}")
|
|
2890
|
+
ordered_list_stack.append(True)
|
|
2891
|
+
elif item.label == GroupLabel.LIST:
|
|
2892
|
+
output_parts.append(
|
|
2893
|
+
f"<{DocumentToken.UNORDERED_LIST.value}>{delim}"
|
|
2894
|
+
)
|
|
2895
|
+
ordered_list_stack.append(False)
|
|
2896
|
+
continue
|
|
2865
2897
|
|
|
2866
|
-
|
|
2898
|
+
# For other item types, optionally insert page-break if the page changed
|
|
2899
|
+
output_parts, previous_page_no = _add_page_break_if_needed(
|
|
2900
|
+
output_parts, item, previous_page_no, add_page_index
|
|
2901
|
+
)
|
|
2867
2902
|
|
|
2868
|
-
|
|
2869
|
-
|
|
2870
|
-
|
|
2871
|
-
|
|
2872
|
-
|
|
2873
|
-
|
|
2874
|
-
|
|
2875
|
-
|
|
2903
|
+
if isinstance(item, SectionHeaderItem):
|
|
2904
|
+
output_parts.append(
|
|
2905
|
+
item.export_to_document_tokens(
|
|
2906
|
+
doc=self,
|
|
2907
|
+
new_line=delim,
|
|
2908
|
+
xsize=xsize,
|
|
2909
|
+
ysize=ysize,
|
|
2910
|
+
add_location=add_location,
|
|
2911
|
+
add_content=add_content,
|
|
2912
|
+
)
|
|
2876
2913
|
)
|
|
2877
|
-
elif isinstance(item, CodeItem)
|
|
2878
|
-
|
|
2879
|
-
|
|
2880
|
-
|
|
2881
|
-
|
|
2882
|
-
|
|
2883
|
-
|
|
2884
|
-
|
|
2885
|
-
|
|
2886
|
-
|
|
2914
|
+
elif isinstance(item, CodeItem):
|
|
2915
|
+
output_parts.append(
|
|
2916
|
+
item.export_to_document_tokens(
|
|
2917
|
+
doc=self,
|
|
2918
|
+
new_line=delim,
|
|
2919
|
+
xsize=xsize,
|
|
2920
|
+
ysize=ysize,
|
|
2921
|
+
add_location=add_location,
|
|
2922
|
+
add_content=add_content,
|
|
2923
|
+
)
|
|
2887
2924
|
)
|
|
2888
|
-
|
|
2889
|
-
|
|
2890
|
-
|
|
2891
|
-
|
|
2892
|
-
|
|
2893
|
-
|
|
2894
|
-
|
|
2895
|
-
|
|
2896
|
-
|
|
2897
|
-
|
|
2898
|
-
add_page_index=add_page_index,
|
|
2925
|
+
elif isinstance(item, TextItem):
|
|
2926
|
+
output_parts.append(
|
|
2927
|
+
item.export_to_document_tokens(
|
|
2928
|
+
doc=self,
|
|
2929
|
+
new_line=delim,
|
|
2930
|
+
xsize=xsize,
|
|
2931
|
+
ysize=ysize,
|
|
2932
|
+
add_location=add_location,
|
|
2933
|
+
add_content=add_content,
|
|
2934
|
+
)
|
|
2899
2935
|
)
|
|
2900
|
-
|
|
2901
|
-
|
|
2902
|
-
|
|
2903
|
-
|
|
2904
|
-
|
|
2905
|
-
|
|
2906
|
-
|
|
2907
|
-
|
|
2908
|
-
|
|
2909
|
-
|
|
2910
|
-
|
|
2911
|
-
|
|
2912
|
-
add_cell_label=add_table_cell_label,
|
|
2913
|
-
add_cell_text=add_table_cell_text,
|
|
2914
|
-
add_page_index=add_page_index,
|
|
2936
|
+
elif isinstance(item, TableItem):
|
|
2937
|
+
output_parts.append(
|
|
2938
|
+
item.export_to_document_tokens(
|
|
2939
|
+
doc=self,
|
|
2940
|
+
new_line=delim,
|
|
2941
|
+
xsize=xsize,
|
|
2942
|
+
ysize=ysize,
|
|
2943
|
+
add_location=add_location,
|
|
2944
|
+
add_cell_location=add_table_cell_location,
|
|
2945
|
+
add_cell_text=add_table_cell_text,
|
|
2946
|
+
add_caption=True,
|
|
2947
|
+
)
|
|
2915
2948
|
)
|
|
2916
|
-
|
|
2917
|
-
|
|
2918
|
-
|
|
2919
|
-
|
|
2920
|
-
|
|
2921
|
-
|
|
2922
|
-
|
|
2923
|
-
|
|
2924
|
-
|
|
2925
|
-
|
|
2926
|
-
|
|
2927
|
-
add_page_index=add_page_index,
|
|
2949
|
+
elif isinstance(item, PictureItem):
|
|
2950
|
+
output_parts.append(
|
|
2951
|
+
item.export_to_document_tokens(
|
|
2952
|
+
doc=self,
|
|
2953
|
+
new_line=delim,
|
|
2954
|
+
xsize=xsize,
|
|
2955
|
+
ysize=ysize,
|
|
2956
|
+
add_caption=True,
|
|
2957
|
+
add_location=add_location,
|
|
2958
|
+
add_content=add_content,
|
|
2959
|
+
)
|
|
2928
2960
|
)
|
|
2929
2961
|
|
|
2930
|
-
|
|
2962
|
+
# End any lists that might still be open
|
|
2963
|
+
ordered_list_stack = _close_lists(
|
|
2964
|
+
0, previous_level, ordered_list_stack, output_parts
|
|
2965
|
+
)
|
|
2931
2966
|
|
|
2932
|
-
|
|
2967
|
+
# End document
|
|
2968
|
+
output_parts.append(f"</{DocumentToken.DOCUMENT.value}>")
|
|
2969
|
+
|
|
2970
|
+
return "".join(output_parts)
|
|
2933
2971
|
|
|
2934
2972
|
def _export_to_indented_text(
|
|
2935
2973
|
self, indent=" ", max_text_len: int = -1, explicit_tables: bool = False
|
docling_core/types/doc/labels.py
CHANGED
docling_core/types/doc/tokens.py
CHANGED
|
@@ -8,13 +8,15 @@
|
|
|
8
8
|
from enum import Enum
|
|
9
9
|
from typing import Tuple
|
|
10
10
|
|
|
11
|
+
from docling_core.types.doc.labels import PictureClassificationLabel
|
|
12
|
+
|
|
11
13
|
|
|
12
14
|
class TableToken(Enum):
|
|
13
15
|
"""Class to represent an LLM friendly representation of a Table."""
|
|
14
16
|
|
|
15
17
|
CELL_LABEL_COLUMN_HEADER = "<column_header>"
|
|
16
18
|
CELL_LABEL_ROW_HEADER = "<row_header>"
|
|
17
|
-
|
|
19
|
+
CELL_LABEL_SECTION_HEADER = "<shed>"
|
|
18
20
|
CELL_LABEL_DATA = "<data>"
|
|
19
21
|
|
|
20
22
|
OTSL_ECEL = "<ecel>" # empty cell
|
|
@@ -42,83 +44,30 @@ class TableToken(Enum):
|
|
|
42
44
|
class DocumentToken(Enum):
|
|
43
45
|
"""Class to represent an LLM friendly representation of a Document."""
|
|
44
46
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
BEG_ABSTRACT = "<abstract>"
|
|
52
|
-
END_ABSTRACT = "</abstract>"
|
|
53
|
-
|
|
54
|
-
BEG_DOI = "<doi>"
|
|
55
|
-
END_DOI = "</doi>"
|
|
56
|
-
BEG_DATE = "<date>"
|
|
57
|
-
END_DATE = "</date>"
|
|
58
|
-
|
|
59
|
-
BEG_AUTHORS = "<authors>"
|
|
60
|
-
END_AUTHORS = "</authors>"
|
|
61
|
-
BEG_AUTHOR = "<author>"
|
|
62
|
-
END_AUTHOR = "</author>"
|
|
63
|
-
|
|
64
|
-
BEG_AFFILIATIONS = "<affiliations>"
|
|
65
|
-
END_AFFILIATIONS = "</affiliations>"
|
|
66
|
-
BEG_AFFILIATION = "<affiliation>"
|
|
67
|
-
END_AFFILIATION = "</affiliation>"
|
|
68
|
-
|
|
69
|
-
BEG_HEADER = "<section-header>"
|
|
70
|
-
END_HEADER = "</section-header>"
|
|
71
|
-
BEG_TEXT = "<text>"
|
|
72
|
-
END_TEXT = "</text>"
|
|
73
|
-
BEG_PARAGRAPH = "<paragraph>"
|
|
74
|
-
END_PARAGRAPH = "</paragraph>"
|
|
75
|
-
BEG_TABLE = "<table>"
|
|
76
|
-
END_TABLE = "</table>"
|
|
77
|
-
BEG_FIGURE = "<figure>"
|
|
78
|
-
END_FIGURE = "</figure>"
|
|
79
|
-
BEG_CAPTION = "<caption>"
|
|
80
|
-
END_CAPTION = "</caption>"
|
|
81
|
-
BEG_EQUATION = "<equation>"
|
|
82
|
-
END_EQUATION = "</equation>"
|
|
83
|
-
BEG_LIST = "<list>"
|
|
84
|
-
END_LIST = "</list>"
|
|
85
|
-
BEG_LISTITEM = "<list-item>"
|
|
86
|
-
END_LISTITEM = "</list-item>"
|
|
87
|
-
|
|
88
|
-
BEG_LOCATION = "<location>"
|
|
89
|
-
END_LOCATION = "</location>"
|
|
90
|
-
BEG_GROUP = "<group>"
|
|
91
|
-
END_GROUP = "</group>"
|
|
47
|
+
DOCUMENT = "doctag"
|
|
48
|
+
OTSL = "otsl"
|
|
49
|
+
ORDERED_LIST = "ordered_list"
|
|
50
|
+
UNORDERED_LIST = "unordered_list"
|
|
51
|
+
LOC = "loc_"
|
|
52
|
+
PAGE_BREAK = "page_break"
|
|
92
53
|
|
|
93
54
|
@classmethod
|
|
94
55
|
def get_special_tokens(
|
|
95
56
|
cls,
|
|
96
|
-
max_rows: int = 100,
|
|
97
|
-
max_cols: int = 100,
|
|
98
|
-
max_pages: int = 1000,
|
|
99
57
|
page_dimension: Tuple[int, int] = (100, 100),
|
|
100
58
|
):
|
|
101
59
|
"""Function to get all special document tokens."""
|
|
102
60
|
special_tokens = [token.value for token in cls]
|
|
103
61
|
|
|
104
|
-
# Adding dynamically generated row and col tokens
|
|
105
|
-
for i in range(0, max_rows + 1):
|
|
106
|
-
special_tokens += [f"<row_{i}>", f"</row_{i}>"]
|
|
107
|
-
|
|
108
|
-
for i in range(0, max_cols + 1):
|
|
109
|
-
special_tokens += [f"<col_{i}>", f"</col_{i}>"]
|
|
110
|
-
|
|
111
|
-
for i in range(6):
|
|
112
|
-
special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
|
|
113
|
-
|
|
114
|
-
# FIXME: this is synonym of section header
|
|
115
62
|
for i in range(6):
|
|
116
|
-
special_tokens += [
|
|
63
|
+
special_tokens += [
|
|
64
|
+
f"<section_header_level_{i}>",
|
|
65
|
+
f"</section_header_level_{i}>",
|
|
66
|
+
]
|
|
117
67
|
|
|
118
|
-
#
|
|
119
|
-
for
|
|
120
|
-
special_tokens.append(f"<
|
|
121
|
-
special_tokens.append(f"</page_{i}>")
|
|
68
|
+
# Add dynamically picture classification tokens
|
|
69
|
+
for _, member in PictureClassificationLabel.__members__.items():
|
|
70
|
+
special_tokens.append(f"<{member}>")
|
|
122
71
|
|
|
123
72
|
# Adding dynamically generated location-tokens
|
|
124
73
|
for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
|
|
@@ -132,25 +81,9 @@ class DocumentToken(Enum):
|
|
|
132
81
|
return label in DocumentToken.get_special_tokens()
|
|
133
82
|
|
|
134
83
|
@staticmethod
|
|
135
|
-
def
|
|
136
|
-
"""Function to get
|
|
137
|
-
|
|
138
|
-
return f"<row_{row}>"
|
|
139
|
-
else:
|
|
140
|
-
return f"</row_{row}>"
|
|
141
|
-
|
|
142
|
-
@staticmethod
|
|
143
|
-
def get_col_token(col: int, beg=bool) -> str:
|
|
144
|
-
"""Function to get page tokens."""
|
|
145
|
-
if beg:
|
|
146
|
-
return f"<col_{col}>"
|
|
147
|
-
else:
|
|
148
|
-
return f"</col_{col}>"
|
|
149
|
-
|
|
150
|
-
@staticmethod
|
|
151
|
-
def get_page_token(page: int):
|
|
152
|
-
"""Function to get page tokens."""
|
|
153
|
-
return f"<page_{page}>"
|
|
84
|
+
def get_picture_classification_token(classification: str) -> str:
|
|
85
|
+
"""Function to get picture classification tokens."""
|
|
86
|
+
return f"<{classification}>"
|
|
154
87
|
|
|
155
88
|
@staticmethod
|
|
156
89
|
def get_location_token(val: float, rnorm: int = 100):
|
|
@@ -172,7 +105,6 @@ class DocumentToken(Enum):
|
|
|
172
105
|
page_h: float,
|
|
173
106
|
xsize: int = 100,
|
|
174
107
|
ysize: int = 100,
|
|
175
|
-
page_i: int = -1,
|
|
176
108
|
):
|
|
177
109
|
"""Get the location string give bbox and page-dim."""
|
|
178
110
|
assert bbox[0] <= bbox[2], f"bbox[0]<=bbox[2] => {bbox[0]}<={bbox[2]}"
|
|
@@ -183,17 +115,11 @@ class DocumentToken(Enum):
|
|
|
183
115
|
x1 = bbox[2] / page_w
|
|
184
116
|
y1 = bbox[3] / page_h
|
|
185
117
|
|
|
186
|
-
page_tok = ""
|
|
187
|
-
if page_i != -1:
|
|
188
|
-
page_tok = DocumentToken.get_page_token(page=page_i)
|
|
189
|
-
|
|
190
118
|
x0_tok = DocumentToken.get_location_token(val=min(x0, x1), rnorm=xsize)
|
|
191
119
|
y0_tok = DocumentToken.get_location_token(val=min(y0, y1), rnorm=ysize)
|
|
192
120
|
x1_tok = DocumentToken.get_location_token(val=max(x0, x1), rnorm=xsize)
|
|
193
121
|
y1_tok = DocumentToken.get_location_token(val=max(y0, y1), rnorm=ysize)
|
|
194
122
|
|
|
195
|
-
loc_str = f"{
|
|
196
|
-
loc_str += f"{page_tok}{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
|
|
197
|
-
loc_str += f"{DocumentToken.END_LOCATION.value}"
|
|
123
|
+
loc_str = f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
|
|
198
124
|
|
|
199
125
|
return loc_str
|
|
@@ -18,15 +18,15 @@ docling_core/search/package.py,sha256=Lz2ml2eDy5t0ZimnGTq-DXHAn-f18w0bn4H5xrhs75
|
|
|
18
18
|
docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9ACDd57ds,106
|
|
19
19
|
docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
|
|
20
20
|
docling_core/transforms/chunker/base.py,sha256=BSWTiFOsF5YaZaZJZY8nwIdOXb9uufJMRIds7LxRNh8,2546
|
|
21
|
-
docling_core/transforms/chunker/hierarchical_chunker.py,sha256=
|
|
21
|
+
docling_core/transforms/chunker/hierarchical_chunker.py,sha256=MStDUDtzFGc6j8v9AkcAnnSHTDxdoiVrp8FTmRdGqU8,8138
|
|
22
22
|
docling_core/transforms/chunker/hybrid_chunker.py,sha256=kokjDdxjc_gygOokQwYFVnHv2NjWTgf9uex8o0ole7w,9876
|
|
23
23
|
docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
|
|
24
24
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
25
25
|
docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
|
|
26
26
|
docling_core/types/doc/base.py,sha256=lMRNq1DUK7K26L2VNZRqFaItCSZ6m9BdYTVaJA98PZQ,11495
|
|
27
|
-
docling_core/types/doc/document.py,sha256=
|
|
28
|
-
docling_core/types/doc/labels.py,sha256=
|
|
29
|
-
docling_core/types/doc/tokens.py,sha256=
|
|
27
|
+
docling_core/types/doc/document.py,sha256=t1nk1GeR5_YvZhuWUVZkkBekp89vFB4RBtMuwD3Acw4,104373
|
|
28
|
+
docling_core/types/doc/labels.py,sha256=cqH4DGN9lgZns6gOtL5urzZzUPGOjHJ75xQbIKSh_h8,5306
|
|
29
|
+
docling_core/types/doc/tokens.py,sha256=i73PXkmqXCLsQ5SddnJX8L9e_Ub2_K_DYSE-VE8NDq0,3925
|
|
30
30
|
docling_core/types/doc/utils.py,sha256=SaiQD-WMMooFm1bMqwatU-IGhtG048iKJb-ppnJit_k,2250
|
|
31
31
|
docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
|
|
32
32
|
docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
|
|
@@ -56,8 +56,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
|
|
|
56
56
|
docling_core/utils/legacy.py,sha256=SqNQAxl97aHfoJEsC9vZcMJg5FNkmqKPFi-wdSrnfI0,24442
|
|
57
57
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
58
58
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
59
|
-
docling_core-2.
|
|
60
|
-
docling_core-2.
|
|
61
|
-
docling_core-2.
|
|
62
|
-
docling_core-2.
|
|
63
|
-
docling_core-2.
|
|
59
|
+
docling_core-2.19.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
60
|
+
docling_core-2.19.1.dist-info/METADATA,sha256=Uz-AUOD2_itxSEVxatsPbCQ0pFBE3fMX-gXx0YLmsKw,5803
|
|
61
|
+
docling_core-2.19.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
62
|
+
docling_core-2.19.1.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
|
|
63
|
+
docling_core-2.19.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|