docling-core: docling_core-2.18.0-py3-none-any.whl → docling_core-2.19.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/chunker/hierarchical_chunker.py +5 -2
- docling_core/types/doc/document.py +315 -293
- docling_core/types/doc/labels.py +1 -1
- docling_core/types/doc/tokens.py +20 -94
- {docling_core-2.18.0.dist-info → docling_core-2.19.0.dist-info}/METADATA +2 -2
- {docling_core-2.18.0.dist-info → docling_core-2.19.0.dist-info}/RECORD +9 -9
- {docling_core-2.18.0.dist-info → docling_core-2.19.0.dist-info}/LICENSE +0 -0
- {docling_core-2.18.0.dist-info → docling_core-2.19.0.dist-info}/WHEEL +0 -0
- {docling_core-2.18.0.dist-info → docling_core-2.19.0.dist-info}/entry_points.txt +0 -0
docling_core/transforms/chunker/hierarchical_chunker.py
CHANGED
|
@@ -19,6 +19,7 @@ from docling_core.search.package import VERSION_PATTERN
|
|
|
19
19
|
from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
|
|
20
20
|
from docling_core.types import DoclingDocument as DLDocument
|
|
21
21
|
from docling_core.types.doc.document import (
|
|
22
|
+
CodeItem,
|
|
22
23
|
DocItem,
|
|
23
24
|
DocumentOrigin,
|
|
24
25
|
LevelNumber,
|
|
@@ -199,8 +200,10 @@ class HierarchicalChunker(BaseChunker):
|
|
|
199
200
|
heading_by_level.pop(k, None)
|
|
200
201
|
continue
|
|
201
202
|
|
|
202
|
-
if
|
|
203
|
-
|
|
203
|
+
if (
|
|
204
|
+
isinstance(item, TextItem)
|
|
205
|
+
or ((not self.merge_list_items) and isinstance(item, ListItem))
|
|
206
|
+
or isinstance(item, CodeItem)
|
|
204
207
|
):
|
|
205
208
|
text = item.text
|
|
206
209
|
elif isinstance(item, TableItem):
|
docling_core/types/doc/document.py
CHANGED
|
@@ -75,6 +75,14 @@ DEFAULT_EXPORT_LABELS = {
|
|
|
75
75
|
DocItemLabel.PAGE_FOOTER,
|
|
76
76
|
}
|
|
77
77
|
|
|
78
|
+
DOCUMENT_TOKENS_EXPORT_LABELS = DEFAULT_EXPORT_LABELS.copy()
|
|
79
|
+
DOCUMENT_TOKENS_EXPORT_LABELS.update(
|
|
80
|
+
[
|
|
81
|
+
DocItemLabel.FOOTNOTE,
|
|
82
|
+
DocItemLabel.CAPTION,
|
|
83
|
+
]
|
|
84
|
+
)
|
|
85
|
+
|
|
78
86
|
|
|
79
87
|
class BasePictureData(BaseModel):
|
|
80
88
|
"""BasePictureData."""
|
|
@@ -564,9 +572,8 @@ class DocItem(
|
|
|
564
572
|
self,
|
|
565
573
|
doc: "DoclingDocument",
|
|
566
574
|
new_line: str,
|
|
567
|
-
xsize: int =
|
|
568
|
-
ysize: int =
|
|
569
|
-
add_page_index: bool = True,
|
|
575
|
+
xsize: int = 500,
|
|
576
|
+
ysize: int = 500,
|
|
570
577
|
) -> str:
|
|
571
578
|
"""Get the location string for the BaseCell."""
|
|
572
579
|
if not len(self.prov):
|
|
@@ -576,17 +583,12 @@ class DocItem(
|
|
|
576
583
|
for prov in self.prov:
|
|
577
584
|
page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
|
|
578
585
|
|
|
579
|
-
page_i = -1
|
|
580
|
-
if add_page_index:
|
|
581
|
-
page_i = prov.page_no
|
|
582
|
-
|
|
583
586
|
loc_str = DocumentToken.get_location(
|
|
584
|
-
bbox=prov.bbox.
|
|
587
|
+
bbox=prov.bbox.to_top_left_origin(page_h).as_tuple(),
|
|
585
588
|
page_w=page_w,
|
|
586
589
|
page_h=page_h,
|
|
587
590
|
xsize=xsize,
|
|
588
591
|
ysize=ysize,
|
|
589
|
-
page_i=page_i,
|
|
590
592
|
)
|
|
591
593
|
location += f"{loc_str}{new_line}"
|
|
592
594
|
|
|
@@ -641,57 +643,40 @@ class TextItem(DocItem):
|
|
|
641
643
|
def export_to_document_tokens(
|
|
642
644
|
self,
|
|
643
645
|
doc: "DoclingDocument",
|
|
644
|
-
new_line: str = "
|
|
645
|
-
xsize: int =
|
|
646
|
-
ysize: int =
|
|
646
|
+
new_line: str = "",
|
|
647
|
+
xsize: int = 500,
|
|
648
|
+
ysize: int = 500,
|
|
647
649
|
add_location: bool = True,
|
|
648
650
|
add_content: bool = True,
|
|
649
|
-
add_page_index: bool = True,
|
|
650
651
|
):
|
|
651
652
|
r"""Export text element to document tokens format.
|
|
652
653
|
|
|
653
654
|
:param doc: "DoclingDocument":
|
|
654
|
-
:param new_line: str
|
|
655
|
-
:param xsize: int: (Default value =
|
|
656
|
-
:param ysize: int: (Default value =
|
|
655
|
+
:param new_line: str (Default value = "")
|
|
656
|
+
:param xsize: int: (Default value = 500)
|
|
657
|
+
:param ysize: int: (Default value = 500)
|
|
657
658
|
:param add_location: bool: (Default value = True)
|
|
658
659
|
:param add_content: bool: (Default value = True)
|
|
659
|
-
:param add_page_index: bool: (Default value = True)
|
|
660
660
|
|
|
661
661
|
"""
|
|
662
|
-
body = f"<{self.label.value}>"
|
|
663
|
-
|
|
664
|
-
# TODO: This must be done through an explicit mapping.
|
|
665
|
-
# assert DocumentToken.is_known_token(
|
|
666
|
-
# body
|
|
667
|
-
# ), f"failed DocumentToken.is_known_token({body})"
|
|
662
|
+
body = f"<{self.label.value}>{new_line}"
|
|
668
663
|
|
|
669
664
|
if add_location:
|
|
670
665
|
body += self.get_location_tokens(
|
|
671
666
|
doc=doc,
|
|
672
|
-
new_line=
|
|
667
|
+
new_line=new_line,
|
|
673
668
|
xsize=xsize,
|
|
674
669
|
ysize=ysize,
|
|
675
|
-
add_page_index=add_page_index,
|
|
676
670
|
)
|
|
677
671
|
|
|
678
672
|
if add_content and self.text is not None:
|
|
679
|
-
body += self.text.strip()
|
|
673
|
+
body += f"{self.text.strip()}{new_line}"
|
|
680
674
|
|
|
681
|
-
body += f"</{self.label.value}
|
|
675
|
+
body += f"</{self.label.value}>\n"
|
|
682
676
|
|
|
683
677
|
return body
|
|
684
678
|
|
|
685
679
|
|
|
686
|
-
class CodeItem(TextItem):
|
|
687
|
-
"""CodeItem."""
|
|
688
|
-
|
|
689
|
-
label: typing.Literal[DocItemLabel.CODE] = (
|
|
690
|
-
DocItemLabel.CODE # type: ignore[assignment]
|
|
691
|
-
)
|
|
692
|
-
code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN
|
|
693
|
-
|
|
694
|
-
|
|
695
680
|
class SectionHeaderItem(TextItem):
|
|
696
681
|
"""SectionItem."""
|
|
697
682
|
|
|
@@ -703,25 +688,23 @@ class SectionHeaderItem(TextItem):
|
|
|
703
688
|
def export_to_document_tokens(
|
|
704
689
|
self,
|
|
705
690
|
doc: "DoclingDocument",
|
|
706
|
-
new_line: str = "
|
|
707
|
-
xsize: int =
|
|
708
|
-
ysize: int =
|
|
691
|
+
new_line: str = "",
|
|
692
|
+
xsize: int = 500,
|
|
693
|
+
ysize: int = 500,
|
|
709
694
|
add_location: bool = True,
|
|
710
695
|
add_content: bool = True,
|
|
711
|
-
add_page_index: bool = True,
|
|
712
696
|
):
|
|
713
697
|
r"""Export text element to document tokens format.
|
|
714
698
|
|
|
715
699
|
:param doc: "DoclingDocument":
|
|
716
|
-
:param new_line: str
|
|
717
|
-
:param xsize: int: (Default value =
|
|
718
|
-
:param ysize: int: (Default value =
|
|
700
|
+
:param new_line: str (Default value = "")
|
|
701
|
+
:param xsize: int: (Default value = 500)
|
|
702
|
+
:param ysize: int: (Default value = 500)
|
|
719
703
|
:param add_location: bool: (Default value = True)
|
|
720
704
|
:param add_content: bool: (Default value = True)
|
|
721
|
-
:param add_page_index: bool: (Default value = True)
|
|
722
705
|
|
|
723
706
|
"""
|
|
724
|
-
body = f"<{self.label.value}_level_{self.level}>"
|
|
707
|
+
body = f"<{self.label.value}_level_{self.level}>{new_line}"
|
|
725
708
|
|
|
726
709
|
# TODO: This must be done through an explicit mapping.
|
|
727
710
|
# assert DocumentToken.is_known_token(
|
|
@@ -731,16 +714,15 @@ class SectionHeaderItem(TextItem):
|
|
|
731
714
|
if add_location:
|
|
732
715
|
body += self.get_location_tokens(
|
|
733
716
|
doc=doc,
|
|
734
|
-
new_line=
|
|
717
|
+
new_line=new_line,
|
|
735
718
|
xsize=xsize,
|
|
736
719
|
ysize=ysize,
|
|
737
|
-
add_page_index=add_page_index,
|
|
738
720
|
)
|
|
739
721
|
|
|
740
722
|
if add_content and self.text is not None:
|
|
741
|
-
body += self.text.strip()
|
|
723
|
+
body += f"{self.text.strip()}{new_line}"
|
|
742
724
|
|
|
743
|
-
body += f"</{self.label.value}_level_{self.level}
|
|
725
|
+
body += f"</{self.label.value}_level_{self.level}>\n"
|
|
744
726
|
|
|
745
727
|
return body
|
|
746
728
|
|
|
@@ -785,6 +767,51 @@ class FloatingItem(DocItem):
|
|
|
785
767
|
return super().get_image(doc=doc)
|
|
786
768
|
|
|
787
769
|
|
|
770
|
+
class CodeItem(FloatingItem, TextItem):
|
|
771
|
+
"""CodeItem."""
|
|
772
|
+
|
|
773
|
+
label: typing.Literal[DocItemLabel.CODE] = (
|
|
774
|
+
DocItemLabel.CODE # type: ignore[assignment]
|
|
775
|
+
)
|
|
776
|
+
code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN
|
|
777
|
+
|
|
778
|
+
def export_to_document_tokens(
|
|
779
|
+
self,
|
|
780
|
+
doc: "DoclingDocument",
|
|
781
|
+
new_line: str = "",
|
|
782
|
+
xsize: int = 500,
|
|
783
|
+
ysize: int = 500,
|
|
784
|
+
add_location: bool = True,
|
|
785
|
+
add_content: bool = True,
|
|
786
|
+
):
|
|
787
|
+
r"""Export text element to document tokens format.
|
|
788
|
+
|
|
789
|
+
:param doc: "DoclingDocument":
|
|
790
|
+
:param new_line: str (Default value = "")
|
|
791
|
+
:param xsize: int: (Default value = 500)
|
|
792
|
+
:param ysize: int: (Default value = 500)
|
|
793
|
+
:param add_location: bool: (Default value = True)
|
|
794
|
+
:param add_content: bool: (Default value = True)
|
|
795
|
+
|
|
796
|
+
"""
|
|
797
|
+
body = f"<{self.label.value}{new_line}"
|
|
798
|
+
|
|
799
|
+
if add_location:
|
|
800
|
+
body += self.get_location_tokens(
|
|
801
|
+
doc=doc,
|
|
802
|
+
new_line=new_line,
|
|
803
|
+
xsize=xsize,
|
|
804
|
+
ysize=ysize,
|
|
805
|
+
)
|
|
806
|
+
|
|
807
|
+
if add_content and self.text is not None:
|
|
808
|
+
body += f"<_{self.code_language.value}_>{self.text}{new_line}"
|
|
809
|
+
|
|
810
|
+
body += f"</{self.label.value}\n"
|
|
811
|
+
|
|
812
|
+
return body
|
|
813
|
+
|
|
814
|
+
|
|
788
815
|
class PictureItem(FloatingItem):
|
|
789
816
|
"""PictureItem."""
|
|
790
817
|
|
|
@@ -931,47 +958,62 @@ class PictureItem(FloatingItem):
|
|
|
931
958
|
def export_to_document_tokens(
|
|
932
959
|
self,
|
|
933
960
|
doc: "DoclingDocument",
|
|
934
|
-
new_line: str = "
|
|
935
|
-
xsize: int =
|
|
936
|
-
ysize: int =
|
|
961
|
+
new_line: str = "",
|
|
962
|
+
xsize: int = 500,
|
|
963
|
+
ysize: int = 500,
|
|
937
964
|
add_location: bool = True,
|
|
938
965
|
add_caption: bool = True,
|
|
939
966
|
add_content: bool = True, # not used at the moment
|
|
940
|
-
add_page_index: bool = True,
|
|
941
967
|
):
|
|
942
968
|
r"""Export picture to document tokens format.
|
|
943
969
|
|
|
944
970
|
:param doc: "DoclingDocument":
|
|
945
|
-
:param new_line: str
|
|
946
|
-
:param xsize: int: (Default value =
|
|
947
|
-
:param ysize: int: (Default value =
|
|
971
|
+
:param new_line: str (Default value = "")
|
|
972
|
+
:param xsize: int: (Default value = 500)
|
|
973
|
+
:param ysize: int: (Default value = 500)
|
|
948
974
|
:param add_location: bool: (Default value = True)
|
|
949
975
|
:param add_caption: bool: (Default value = True)
|
|
950
976
|
:param add_content: bool: (Default value = True)
|
|
951
|
-
:param # not used at the
|
|
977
|
+
:param # not used at the moment
|
|
952
978
|
|
|
953
979
|
"""
|
|
954
|
-
body = f"{
|
|
955
|
-
|
|
980
|
+
body = f"<{self.label.value}>{new_line}"
|
|
956
981
|
if add_location:
|
|
957
982
|
body += self.get_location_tokens(
|
|
958
983
|
doc=doc,
|
|
959
984
|
new_line=new_line,
|
|
960
985
|
xsize=xsize,
|
|
961
986
|
ysize=ysize,
|
|
962
|
-
add_page_index=add_page_index,
|
|
963
987
|
)
|
|
964
988
|
|
|
989
|
+
classifications = [
|
|
990
|
+
ann
|
|
991
|
+
for ann in self.annotations
|
|
992
|
+
if isinstance(ann, PictureClassificationData)
|
|
993
|
+
]
|
|
994
|
+
if len(classifications) > 0:
|
|
995
|
+
# ! TODO: currently this code assumes class_name is of type 'str'
|
|
996
|
+
# ! TODO: when it will change to an ENUM --> adapt code
|
|
997
|
+
predicted_class = classifications[0].predicted_classes[0].class_name
|
|
998
|
+
body += DocumentToken.get_picture_classification_token(predicted_class)
|
|
999
|
+
|
|
965
1000
|
if add_caption and len(self.captions):
|
|
966
1001
|
text = self.caption_text(doc)
|
|
967
1002
|
|
|
968
1003
|
if len(text):
|
|
969
|
-
body += f"{
|
|
1004
|
+
body += f"<{DocItemLabel.CAPTION.value}>"
|
|
1005
|
+
for caption in self.captions:
|
|
1006
|
+
body += caption.resolve(doc).get_location_tokens(
|
|
1007
|
+
doc=doc,
|
|
1008
|
+
new_line=new_line,
|
|
1009
|
+
xsize=xsize,
|
|
1010
|
+
ysize=ysize,
|
|
1011
|
+
)
|
|
970
1012
|
body += f"{text.strip()}"
|
|
971
|
-
body += f"{
|
|
1013
|
+
body += f"</{DocItemLabel.CAPTION.value}>"
|
|
972
1014
|
body += f"{new_line}"
|
|
973
1015
|
|
|
974
|
-
body += f"{
|
|
1016
|
+
body += f"</{self.label.value}>\n"
|
|
975
1017
|
|
|
976
1018
|
return body
|
|
977
1019
|
|
|
@@ -1143,8 +1185,8 @@ class TableItem(FloatingItem):
|
|
|
1143
1185
|
doc: "DoclingDocument",
|
|
1144
1186
|
add_cell_location: bool = True,
|
|
1145
1187
|
add_cell_text: bool = True,
|
|
1146
|
-
xsize: int =
|
|
1147
|
-
ysize: int =
|
|
1188
|
+
xsize: int = 500,
|
|
1189
|
+
ysize: int = 500,
|
|
1148
1190
|
) -> str:
|
|
1149
1191
|
"""Export the table as OTSL."""
|
|
1150
1192
|
# Possible OTSL tokens...
|
|
@@ -1194,7 +1236,6 @@ class TableItem(FloatingItem):
|
|
|
1194
1236
|
page_h=page_h,
|
|
1195
1237
|
xsize=xsize,
|
|
1196
1238
|
ysize=ysize,
|
|
1197
|
-
page_i=page_no,
|
|
1198
1239
|
)
|
|
1199
1240
|
|
|
1200
1241
|
if rowstart == i and colstart == j:
|
|
@@ -1234,33 +1275,29 @@ class TableItem(FloatingItem):
|
|
|
1234
1275
|
def export_to_document_tokens(
|
|
1235
1276
|
self,
|
|
1236
1277
|
doc: "DoclingDocument",
|
|
1237
|
-
new_line: str = "
|
|
1238
|
-
xsize: int =
|
|
1239
|
-
ysize: int =
|
|
1278
|
+
new_line: str = "",
|
|
1279
|
+
xsize: int = 500,
|
|
1280
|
+
ysize: int = 500,
|
|
1240
1281
|
add_location: bool = True,
|
|
1241
|
-
add_caption: bool = True,
|
|
1242
|
-
add_content: bool = True,
|
|
1243
1282
|
add_cell_location: bool = True,
|
|
1244
|
-
add_cell_label: bool = True,
|
|
1245
1283
|
add_cell_text: bool = True,
|
|
1246
|
-
|
|
1284
|
+
add_caption: bool = True,
|
|
1247
1285
|
):
|
|
1248
1286
|
r"""Export table to document tokens format.
|
|
1249
1287
|
|
|
1250
1288
|
:param doc: "DoclingDocument":
|
|
1251
|
-
:param new_line: str
|
|
1252
|
-
:param xsize: int: (Default value =
|
|
1253
|
-
:param ysize: int: (Default value =
|
|
1289
|
+
:param new_line: str (Default value = "")
|
|
1290
|
+
:param xsize: int: (Default value = 500)
|
|
1291
|
+
:param ysize: int: (Default value = 500)
|
|
1254
1292
|
:param add_location: bool: (Default value = True)
|
|
1255
|
-
:param add_caption: bool: (Default value = True)
|
|
1256
|
-
:param add_content: bool: (Default value = True)
|
|
1257
1293
|
:param add_cell_location: bool: (Default value = True)
|
|
1258
|
-
:param add_cell_label: bool: (Default value = True)
|
|
1259
1294
|
:param add_cell_text: bool: (Default value = True)
|
|
1260
|
-
:param
|
|
1295
|
+
:param add_caption: bool: (Default value = True)
|
|
1261
1296
|
|
|
1262
1297
|
"""
|
|
1263
|
-
|
|
1298
|
+
otsl_tag = DocumentToken.OTSL.value
|
|
1299
|
+
|
|
1300
|
+
body = f"<{otsl_tag}>{new_line}"
|
|
1264
1301
|
|
|
1265
1302
|
if add_location:
|
|
1266
1303
|
body += self.get_location_tokens(
|
|
@@ -1268,76 +1305,27 @@ class TableItem(FloatingItem):
|
|
|
1268
1305
|
new_line=new_line,
|
|
1269
1306
|
xsize=xsize,
|
|
1270
1307
|
ysize=ysize,
|
|
1271
|
-
add_page_index=add_page_index,
|
|
1272
1308
|
)
|
|
1273
1309
|
|
|
1310
|
+
body += self.export_to_otsl(doc, add_cell_location, add_cell_text, xsize, ysize)
|
|
1311
|
+
|
|
1274
1312
|
if add_caption and len(self.captions):
|
|
1275
1313
|
text = self.caption_text(doc)
|
|
1276
1314
|
|
|
1277
1315
|
if len(text):
|
|
1278
|
-
body += f"{
|
|
1316
|
+
body += f"<{DocItemLabel.CAPTION.value}>"
|
|
1317
|
+
for caption in self.captions:
|
|
1318
|
+
body += caption.resolve(doc).get_location_tokens(
|
|
1319
|
+
doc=doc,
|
|
1320
|
+
new_line=new_line,
|
|
1321
|
+
xsize=xsize,
|
|
1322
|
+
ysize=ysize,
|
|
1323
|
+
)
|
|
1279
1324
|
body += f"{text.strip()}"
|
|
1280
|
-
body += f"{
|
|
1325
|
+
body += f"</{DocItemLabel.CAPTION.value}>"
|
|
1281
1326
|
body += f"{new_line}"
|
|
1282
1327
|
|
|
1283
|
-
|
|
1284
|
-
for i, row in enumerate(self.data.grid):
|
|
1285
|
-
body += f"<row_{i}>"
|
|
1286
|
-
for j, col in enumerate(row):
|
|
1287
|
-
|
|
1288
|
-
text = ""
|
|
1289
|
-
if add_cell_text:
|
|
1290
|
-
text = col.text.strip()
|
|
1291
|
-
|
|
1292
|
-
cell_loc = ""
|
|
1293
|
-
if (
|
|
1294
|
-
col.bbox is not None
|
|
1295
|
-
and add_cell_location
|
|
1296
|
-
and add_page_index
|
|
1297
|
-
and len(self.prov) > 0
|
|
1298
|
-
):
|
|
1299
|
-
page_w, page_h = doc.pages[self.prov[0].page_no].size.as_tuple()
|
|
1300
|
-
cell_loc = DocumentToken.get_location(
|
|
1301
|
-
bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
|
|
1302
|
-
page_w=page_w,
|
|
1303
|
-
page_h=page_h,
|
|
1304
|
-
xsize=xsize,
|
|
1305
|
-
ysize=ysize,
|
|
1306
|
-
page_i=self.prov[0].page_no,
|
|
1307
|
-
)
|
|
1308
|
-
elif (
|
|
1309
|
-
col.bbox is not None
|
|
1310
|
-
and add_cell_location
|
|
1311
|
-
and not add_page_index
|
|
1312
|
-
and len(self.prov) > 0
|
|
1313
|
-
):
|
|
1314
|
-
page_w, page_h = doc.pages[self.prov[0].page_no].size.as_tuple()
|
|
1315
|
-
|
|
1316
|
-
cell_loc = DocumentToken.get_location(
|
|
1317
|
-
bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
|
|
1318
|
-
page_w=page_w,
|
|
1319
|
-
page_h=page_h,
|
|
1320
|
-
xsize=xsize,
|
|
1321
|
-
ysize=ysize,
|
|
1322
|
-
page_i=-1,
|
|
1323
|
-
)
|
|
1324
|
-
|
|
1325
|
-
cell_label = ""
|
|
1326
|
-
if add_cell_label:
|
|
1327
|
-
if col.column_header:
|
|
1328
|
-
cell_label = "<col_header>"
|
|
1329
|
-
elif col.row_header:
|
|
1330
|
-
cell_label = "<row_header>"
|
|
1331
|
-
elif col.row_section:
|
|
1332
|
-
cell_label = "<row_section>"
|
|
1333
|
-
else:
|
|
1334
|
-
cell_label = "<body>"
|
|
1335
|
-
|
|
1336
|
-
body += f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>"
|
|
1337
|
-
|
|
1338
|
-
body += f"</row_{i}>{new_line}"
|
|
1339
|
-
|
|
1340
|
-
body += f"{DocumentToken.END_TABLE.value}{new_line}"
|
|
1328
|
+
body += f"</{otsl_tag}>\n"
|
|
1341
1329
|
|
|
1342
1330
|
return body
|
|
1343
1331
|
|
|
@@ -1777,6 +1765,7 @@ class DoclingDocument(BaseModel):
|
|
|
1777
1765
|
text: str,
|
|
1778
1766
|
code_language: Optional[CodeLanguageLabel] = None,
|
|
1779
1767
|
orig: Optional[str] = None,
|
|
1768
|
+
caption: Optional[Union[TextItem, RefItem]] = None,
|
|
1780
1769
|
prov: Optional[ProvenanceItem] = None,
|
|
1781
1770
|
parent: Optional[NodeItem] = None,
|
|
1782
1771
|
content_layer: Optional[ContentLayer] = None,
|
|
@@ -1786,6 +1775,8 @@ class DoclingDocument(BaseModel):
|
|
|
1786
1775
|
:param text: str:
|
|
1787
1776
|
:param code_language: Optional[str]: (Default value = None)
|
|
1788
1777
|
:param orig: Optional[str]: (Default value = None)
|
|
1778
|
+
:param caption: Optional[Union[TextItem:
|
|
1779
|
+
:param RefItem]]: (Default value = None)
|
|
1789
1780
|
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1790
1781
|
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1791
1782
|
"""
|
|
@@ -1809,6 +1800,8 @@ class DoclingDocument(BaseModel):
|
|
|
1809
1800
|
code_item.content_layer = content_layer
|
|
1810
1801
|
if prov:
|
|
1811
1802
|
code_item.prov.append(prov)
|
|
1803
|
+
if caption:
|
|
1804
|
+
code_item.captions.append(caption.get_ref())
|
|
1812
1805
|
|
|
1813
1806
|
self.texts.append(code_item)
|
|
1814
1807
|
parent.children.append(RefItem(cref=cref))
|
|
@@ -1927,6 +1920,7 @@ class DoclingDocument(BaseModel):
|
|
|
1927
1920
|
traverse_pictures=traverse_pictures,
|
|
1928
1921
|
page_no=page_no,
|
|
1929
1922
|
_level=_level + 1,
|
|
1923
|
+
included_content_layers=included_content_layers,
|
|
1930
1924
|
)
|
|
1931
1925
|
|
|
1932
1926
|
def _clear_picture_pil_cache(self):
|
|
@@ -2708,22 +2702,18 @@ class DoclingDocument(BaseModel):
|
|
|
2708
2702
|
def save_as_document_tokens(
|
|
2709
2703
|
self,
|
|
2710
2704
|
filename: Path,
|
|
2711
|
-
delim: str = "
|
|
2705
|
+
delim: str = "",
|
|
2712
2706
|
from_element: int = 0,
|
|
2713
2707
|
to_element: int = sys.maxsize,
|
|
2714
|
-
labels: set[DocItemLabel] =
|
|
2715
|
-
xsize: int =
|
|
2716
|
-
ysize: int =
|
|
2708
|
+
labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
|
|
2709
|
+
xsize: int = 500,
|
|
2710
|
+
ysize: int = 500,
|
|
2717
2711
|
add_location: bool = True,
|
|
2718
2712
|
add_content: bool = True,
|
|
2719
2713
|
add_page_index: bool = True,
|
|
2720
2714
|
# table specific flags
|
|
2721
2715
|
add_table_cell_location: bool = False,
|
|
2722
|
-
add_table_cell_label: bool = True,
|
|
2723
2716
|
add_table_cell_text: bool = True,
|
|
2724
|
-
# specifics
|
|
2725
|
-
page_no: Optional[int] = None,
|
|
2726
|
-
with_groups: bool = True,
|
|
2727
2717
|
):
|
|
2728
2718
|
r"""Save the document content to a DocumentToken format."""
|
|
2729
2719
|
out = self.export_to_document_tokens(
|
|
@@ -2738,198 +2728,230 @@ class DoclingDocument(BaseModel):
|
|
|
2738
2728
|
add_page_index=add_page_index,
|
|
2739
2729
|
# table specific flags
|
|
2740
2730
|
add_table_cell_location=add_table_cell_location,
|
|
2741
|
-
add_table_cell_label=add_table_cell_label,
|
|
2742
2731
|
add_table_cell_text=add_table_cell_text,
|
|
2743
|
-
# specifics
|
|
2744
|
-
page_no=page_no,
|
|
2745
|
-
with_groups=with_groups,
|
|
2746
2732
|
)
|
|
2747
2733
|
|
|
2748
2734
|
with open(filename, "w", encoding="utf-8") as fw:
|
|
2749
2735
|
fw.write(out)
|
|
2750
2736
|
|
|
2751
|
-
def export_to_document_tokens(
|
|
2737
|
+
def export_to_document_tokens( # noqa: C901
|
|
2752
2738
|
self,
|
|
2753
|
-
delim: str = "
|
|
2739
|
+
delim: str = "",
|
|
2754
2740
|
from_element: int = 0,
|
|
2755
2741
|
to_element: int = sys.maxsize,
|
|
2756
|
-
labels: set[DocItemLabel] =
|
|
2757
|
-
xsize: int =
|
|
2758
|
-
ysize: int =
|
|
2742
|
+
labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
|
|
2743
|
+
xsize: int = 500,
|
|
2744
|
+
ysize: int = 500,
|
|
2759
2745
|
add_location: bool = True,
|
|
2760
2746
|
add_content: bool = True,
|
|
2761
2747
|
add_page_index: bool = True,
|
|
2762
2748
|
# table specific flags
|
|
2763
2749
|
add_table_cell_location: bool = False,
|
|
2764
|
-
add_table_cell_label: bool = True,
|
|
2765
2750
|
add_table_cell_text: bool = True,
|
|
2766
|
-
# specifics
|
|
2767
|
-
page_no: Optional[int] = None,
|
|
2768
|
-
with_groups: bool = True,
|
|
2769
|
-
newline: bool = True,
|
|
2770
2751
|
) -> str:
|
|
2771
2752
|
r"""Exports the document content to a DocumentToken format.
|
|
2772
2753
|
|
|
2773
2754
|
Operates on a slice of the document's body as defined through arguments
|
|
2774
2755
|
from_element and to_element; defaulting to the whole main_text.
|
|
2775
2756
|
|
|
2776
|
-
:param delim: str: (Default value = "
|
|
2757
|
+
:param delim: str: (Default value = "")
|
|
2777
2758
|
:param from_element: int: (Default value = 0)
|
|
2778
2759
|
:param to_element: Optional[int]: (Default value = None)
|
|
2779
2760
|
:param labels: set[DocItemLabel]
|
|
2780
|
-
:param xsize: int: (Default value =
|
|
2781
|
-
:param ysize: int: (Default value =
|
|
2761
|
+
:param xsize: int: (Default value = 500)
|
|
2762
|
+
:param ysize: int: (Default value = 500)
|
|
2782
2763
|
:param add_location: bool: (Default value = True)
|
|
2783
2764
|
:param add_content: bool: (Default value = True)
|
|
2784
2765
|
:param add_page_index: bool: (Default value = True)
|
|
2785
2766
|
:param # table specific flagsadd_table_cell_location: bool
|
|
2786
|
-
:param add_table_cell_label: bool: (Default value = True)
|
|
2787
2767
|
:param add_table_cell_text: bool: (Default value = True)
|
|
2788
2768
|
:returns: The content of the document formatted as a DocTags string.
|
|
2789
2769
|
:rtype: str
|
|
2790
2770
|
"""
|
|
2791
2771
|
|
|
2792
|
-
def
|
|
2793
|
-
|
|
2794
|
-
|
|
2795
|
-
|
|
2796
|
-
|
|
2797
|
-
|
|
2798
|
-
|
|
2799
|
-
|
|
2800
|
-
|
|
2801
|
-
|
|
2802
|
-
|
|
2803
|
-
while curr_level < prev_level and len(in_ordered_list) > 0:
|
|
2804
|
-
if in_ordered_list[-1]:
|
|
2805
|
-
result += f"</ordered_list>{delim}"
|
|
2772
|
+
def _close_lists(
|
|
2773
|
+
current_level: int,
|
|
2774
|
+
previous_level: int,
|
|
2775
|
+
ordered_list_stack: List[bool],
|
|
2776
|
+
output_parts: List[str],
|
|
2777
|
+
) -> List[bool]:
|
|
2778
|
+
"""Close open list tags until the nesting level matches item's level."""
|
|
2779
|
+
while current_level < previous_level and ordered_list_stack:
|
|
2780
|
+
last_is_ordered = ordered_list_stack.pop()
|
|
2781
|
+
if last_is_ordered:
|
|
2782
|
+
output_parts.append(f"</{DocumentToken.ORDERED_LIST.value}>\n")
|
|
2806
2783
|
else:
|
|
2807
|
-
|
|
2808
|
-
|
|
2809
|
-
|
|
2810
|
-
|
|
2811
|
-
|
|
2812
|
-
|
|
2813
|
-
|
|
2814
|
-
|
|
2815
|
-
|
|
2816
|
-
else:
|
|
2817
|
-
delim = ""
|
|
2818
|
-
|
|
2819
|
-
prev_level = 0 # Track the previous item's level
|
|
2820
|
-
|
|
2821
|
-
in_ordered_list: List[bool] = [] # False
|
|
2822
|
-
|
|
2823
|
-
result = f"{DocumentToken.BEG_DOCUMENT.value}{delim}"
|
|
2824
|
-
|
|
2825
|
-
for ix, (item, curr_level) in enumerate(
|
|
2826
|
-
self.iterate_items(self.body, with_groups=True)
|
|
2784
|
+
output_parts.append(f"</{DocumentToken.UNORDERED_LIST.value}>\n")
|
|
2785
|
+
previous_level -= 1
|
|
2786
|
+
return ordered_list_stack
|
|
2787
|
+
|
|
2788
|
+
def _add_page_break_if_needed(
|
|
2789
|
+
output_parts: List[str],
|
|
2790
|
+
item,
|
|
2791
|
+
prev_page_no,
|
|
2792
|
+
page_break_enabled: bool,
|
|
2827
2793
|
):
|
|
2828
|
-
|
|
2829
|
-
|
|
2830
|
-
if
|
|
2831
|
-
|
|
2832
|
-
|
|
2833
|
-
|
|
2834
|
-
|
|
2835
|
-
|
|
2836
|
-
|
|
2837
|
-
|
|
2838
|
-
|
|
2839
|
-
|
|
2840
|
-
|
|
2841
|
-
|
|
2794
|
+
"""Inserts a page-break token.
|
|
2795
|
+
|
|
2796
|
+
Inserts a page-break token if the item's page number is different
|
|
2797
|
+
from the previous item and page breaks are enabled.
|
|
2798
|
+
Returns the updated output_parts list and the current page number.
|
|
2799
|
+
"""
|
|
2800
|
+
if not page_break_enabled:
|
|
2801
|
+
return output_parts, prev_page_no
|
|
2802
|
+
|
|
2803
|
+
if not item.prov:
|
|
2804
|
+
return output_parts, prev_page_no
|
|
2805
|
+
|
|
2806
|
+
current_page_no = item.prov[0].page_no
|
|
2807
|
+
if prev_page_no is None:
|
|
2808
|
+
return output_parts, current_page_no
|
|
2809
|
+
|
|
2810
|
+
if current_page_no != prev_page_no:
|
|
2811
|
+
output_parts.append(f"<{DocumentToken.PAGE_BREAK.value}>\n")
|
|
2812
|
+
|
|
2813
|
+
return output_parts, current_page_no
|
|
2814
|
+
|
|
2815
|
+
def _get_standalone_captions(document_body):
|
|
2816
|
+
"""Identify captions that are not attached to any table or figure."""
|
|
2817
|
+
all_captions = set()
|
|
2818
|
+
matched_captions = set()
|
|
2819
|
+
for item, _ in self.iterate_items(document_body, with_groups=True):
|
|
2820
|
+
if item.label == DocItemLabel.CAPTION:
|
|
2821
|
+
all_captions.update([item.self_ref])
|
|
2822
|
+
if item.label in [DocItemLabel.PICTURE, DocItemLabel.TABLE]:
|
|
2823
|
+
matched_captions.update([caption.cref for caption in item.captions])
|
|
2824
|
+
|
|
2825
|
+
return all_captions - matched_captions
|
|
2826
|
+
|
|
2827
|
+
# Initialization
|
|
2828
|
+
output_parts: List[str] = []
|
|
2829
|
+
ordered_list_stack: List[bool] = []
|
|
2830
|
+
previous_level = 0
|
|
2831
|
+
previous_page_no = None
|
|
2832
|
+
|
|
2833
|
+
# Precompute standalone captions
|
|
2834
|
+
standalone_captions = _get_standalone_captions(self.body)
|
|
2835
|
+
|
|
2836
|
+
# Begin document
|
|
2837
|
+
output_parts.append(f"<{DocumentToken.DOCUMENT.value}>{delim}")
|
|
2838
|
+
|
|
2839
|
+
for ix, (item, current_level) in enumerate(
|
|
2840
|
+
self.iterate_items(
|
|
2841
|
+
self.body,
|
|
2842
|
+
with_groups=True,
|
|
2843
|
+
included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE},
|
|
2844
|
+
)
|
|
2845
|
+
):
|
|
2846
|
+
# Close lists if we've moved to a lower nesting level
|
|
2847
|
+
if current_level < previous_level and ordered_list_stack:
|
|
2848
|
+
ordered_list_stack = _close_lists(
|
|
2849
|
+
current_level, previous_level, ordered_list_stack, output_parts
|
|
2842
2850
|
)
|
|
2851
|
+
previous_level = current_level
|
|
2843
2852
|
|
|
2844
|
-
|
|
2845
|
-
|
|
2846
|
-
|
|
2847
|
-
continue # skip as many items as you want
|
|
2848
|
-
|
|
2849
|
-
if (isinstance(item, DocItem)) and (item.label not in labels):
|
|
2850
|
-
continue # skip any label that is not whitelisted
|
|
2851
|
-
|
|
2852
|
-
if isinstance(item, GroupItem) and item.label in [
|
|
2853
|
-
GroupLabel.ORDERED_LIST,
|
|
2854
|
-
]:
|
|
2853
|
+
# Skip items outside the specified element range
|
|
2854
|
+
if ix < from_element or ix >= to_element:
|
|
2855
|
+
continue
|
|
2855
2856
|
|
|
2856
|
-
|
|
2857
|
-
|
|
2857
|
+
# Skip items whose label is not in the allowed set
|
|
2858
|
+
if isinstance(item, DocItem) and (item.label not in labels):
|
|
2859
|
+
continue
|
|
2858
2860
|
|
|
2859
|
-
|
|
2860
|
-
|
|
2861
|
-
|
|
2861
|
+
# Skip captions that are not standalone as they will be included below
|
|
2862
|
+
# by the export functions of Table and Picture
|
|
2863
|
+
if (
|
|
2864
|
+
isinstance(item, TextItem)
|
|
2865
|
+
and item.label == DocItemLabel.CAPTION
|
|
2866
|
+
and item.self_ref not in standalone_captions
|
|
2867
|
+
):
|
|
2868
|
+
continue
|
|
2862
2869
|
|
|
2863
|
-
|
|
2864
|
-
|
|
2870
|
+
# Handle list groups
|
|
2871
|
+
if isinstance(item, GroupItem):
|
|
2872
|
+
if item.label == GroupLabel.ORDERED_LIST:
|
|
2873
|
+
output_parts.append(f"<{DocumentToken.ORDERED_LIST.value}>{delim}")
|
|
2874
|
+
ordered_list_stack.append(True)
|
|
2875
|
+
elif item.label == GroupLabel.LIST:
|
|
2876
|
+
output_parts.append(
|
|
2877
|
+
f"<{DocumentToken.UNORDERED_LIST.value}>{delim}"
|
|
2878
|
+
)
|
|
2879
|
+
ordered_list_stack.append(False)
|
|
2880
|
+
continue
|
|
2865
2881
|
|
|
2866
|
-
|
|
2882
|
+
# For other item types, optionally insert page-break if the page changed
|
|
2883
|
+
output_parts, previous_page_no = _add_page_break_if_needed(
|
|
2884
|
+
output_parts, item, previous_page_no, add_page_index
|
|
2885
|
+
)
|
|
2867
2886
|
|
|
2868
|
-
|
|
2869
|
-
|
|
2870
|
-
|
|
2871
|
-
|
|
2872
|
-
|
|
2873
|
-
|
|
2874
|
-
|
|
2875
|
-
|
|
2887
|
+
if isinstance(item, SectionHeaderItem):
|
|
2888
|
+
output_parts.append(
|
|
2889
|
+
item.export_to_document_tokens(
|
|
2890
|
+
doc=self,
|
|
2891
|
+
new_line=delim,
|
|
2892
|
+
xsize=xsize,
|
|
2893
|
+
ysize=ysize,
|
|
2894
|
+
add_location=add_location,
|
|
2895
|
+
add_content=add_content,
|
|
2896
|
+
)
|
|
2876
2897
|
)
|
|
2877
|
-
elif isinstance(item, CodeItem)
|
|
2878
|
-
|
|
2879
|
-
|
|
2880
|
-
|
|
2881
|
-
|
|
2882
|
-
|
|
2883
|
-
|
|
2884
|
-
|
|
2885
|
-
|
|
2886
|
-
|
|
2898
|
+
elif isinstance(item, CodeItem):
|
|
2899
|
+
output_parts.append(
|
|
2900
|
+
item.export_to_document_tokens(
|
|
2901
|
+
doc=self,
|
|
2902
|
+
new_line=delim,
|
|
2903
|
+
xsize=xsize,
|
|
2904
|
+
ysize=ysize,
|
|
2905
|
+
add_location=add_location,
|
|
2906
|
+
add_content=add_content,
|
|
2907
|
+
)
|
|
2887
2908
|
)
|
|
2888
|
-
|
|
2889
|
-
|
|
2890
|
-
|
|
2891
|
-
|
|
2892
|
-
|
|
2893
|
-
|
|
2894
|
-
|
|
2895
|
-
|
|
2896
|
-
|
|
2897
|
-
|
|
2898
|
-
add_page_index=add_page_index,
|
|
2909
|
+
elif isinstance(item, TextItem):
|
|
2910
|
+
output_parts.append(
|
|
2911
|
+
item.export_to_document_tokens(
|
|
2912
|
+
doc=self,
|
|
2913
|
+
new_line=delim,
|
|
2914
|
+
xsize=xsize,
|
|
2915
|
+
ysize=ysize,
|
|
2916
|
+
add_location=add_location,
|
|
2917
|
+
add_content=add_content,
|
|
2918
|
+
)
|
|
2899
2919
|
)
|
|
2900
|
-
|
|
2901
|
-
|
|
2902
|
-
|
|
2903
|
-
|
|
2904
|
-
|
|
2905
|
-
|
|
2906
|
-
|
|
2907
|
-
|
|
2908
|
-
|
|
2909
|
-
|
|
2910
|
-
|
|
2911
|
-
|
|
2912
|
-
add_cell_label=add_table_cell_label,
|
|
2913
|
-
add_cell_text=add_table_cell_text,
|
|
2914
|
-
add_page_index=add_page_index,
|
|
2920
|
+
elif isinstance(item, TableItem):
|
|
2921
|
+
output_parts.append(
|
|
2922
|
+
item.export_to_document_tokens(
|
|
2923
|
+
doc=self,
|
|
2924
|
+
new_line=delim,
|
|
2925
|
+
xsize=xsize,
|
|
2926
|
+
ysize=ysize,
|
|
2927
|
+
add_location=add_location,
|
|
2928
|
+
add_cell_location=add_table_cell_location,
|
|
2929
|
+
add_cell_text=add_table_cell_text,
|
|
2930
|
+
add_caption=True,
|
|
2931
|
+
)
|
|
2915
2932
|
)
|
|
2916
|
-
|
|
2917
|
-
|
|
2918
|
-
|
|
2919
|
-
|
|
2920
|
-
|
|
2921
|
-
|
|
2922
|
-
|
|
2923
|
-
|
|
2924
|
-
|
|
2925
|
-
|
|
2926
|
-
|
|
2927
|
-
add_page_index=add_page_index,
|
|
2933
|
+
elif isinstance(item, PictureItem):
|
|
2934
|
+
output_parts.append(
|
|
2935
|
+
item.export_to_document_tokens(
|
|
2936
|
+
doc=self,
|
|
2937
|
+
new_line=delim,
|
|
2938
|
+
xsize=xsize,
|
|
2939
|
+
ysize=ysize,
|
|
2940
|
+
add_caption=True,
|
|
2941
|
+
add_location=add_location,
|
|
2942
|
+
add_content=add_content,
|
|
2943
|
+
)
|
|
2928
2944
|
)
|
|
2929
2945
|
|
|
2930
|
-
|
|
2946
|
+
# End any lists that might still be open
|
|
2947
|
+
ordered_list_stack = _close_lists(
|
|
2948
|
+
0, previous_level, ordered_list_stack, output_parts
|
|
2949
|
+
)
|
|
2931
2950
|
|
|
2932
|
-
|
|
2951
|
+
# End document
|
|
2952
|
+
output_parts.append(f"</{DocumentToken.DOCUMENT.value}>")
|
|
2953
|
+
|
|
2954
|
+
return "".join(output_parts)
|
|
2933
2955
|
|
|
2934
2956
|
def _export_to_indented_text(
|
|
2935
2957
|
self, indent=" ", max_text_len: int = -1, explicit_tables: bool = False
|
docling_core/types/doc/labels.py
CHANGED
docling_core/types/doc/tokens.py
CHANGED
|
@@ -8,13 +8,15 @@
|
|
|
8
8
|
from enum import Enum
|
|
9
9
|
from typing import Tuple
|
|
10
10
|
|
|
11
|
+
from docling_core.types.doc.labels import PictureClassificationLabel
|
|
12
|
+
|
|
11
13
|
|
|
12
14
|
class TableToken(Enum):
|
|
13
15
|
"""Class to represent an LLM friendly representation of a Table."""
|
|
14
16
|
|
|
15
17
|
CELL_LABEL_COLUMN_HEADER = "<column_header>"
|
|
16
18
|
CELL_LABEL_ROW_HEADER = "<row_header>"
|
|
17
|
-
|
|
19
|
+
CELL_LABEL_SECTION_HEADER = "<shed>"
|
|
18
20
|
CELL_LABEL_DATA = "<data>"
|
|
19
21
|
|
|
20
22
|
OTSL_ECEL = "<ecel>" # empty cell
|
|
@@ -42,83 +44,30 @@ class TableToken(Enum):
|
|
|
42
44
|
class DocumentToken(Enum):
|
|
43
45
|
"""Class to represent an LLM friendly representation of a Document."""
|
|
44
46
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
BEG_ABSTRACT = "<abstract>"
|
|
52
|
-
END_ABSTRACT = "</abstract>"
|
|
53
|
-
|
|
54
|
-
BEG_DOI = "<doi>"
|
|
55
|
-
END_DOI = "</doi>"
|
|
56
|
-
BEG_DATE = "<date>"
|
|
57
|
-
END_DATE = "</date>"
|
|
58
|
-
|
|
59
|
-
BEG_AUTHORS = "<authors>"
|
|
60
|
-
END_AUTHORS = "</authors>"
|
|
61
|
-
BEG_AUTHOR = "<author>"
|
|
62
|
-
END_AUTHOR = "</author>"
|
|
63
|
-
|
|
64
|
-
BEG_AFFILIATIONS = "<affiliations>"
|
|
65
|
-
END_AFFILIATIONS = "</affiliations>"
|
|
66
|
-
BEG_AFFILIATION = "<affiliation>"
|
|
67
|
-
END_AFFILIATION = "</affiliation>"
|
|
68
|
-
|
|
69
|
-
BEG_HEADER = "<section-header>"
|
|
70
|
-
END_HEADER = "</section-header>"
|
|
71
|
-
BEG_TEXT = "<text>"
|
|
72
|
-
END_TEXT = "</text>"
|
|
73
|
-
BEG_PARAGRAPH = "<paragraph>"
|
|
74
|
-
END_PARAGRAPH = "</paragraph>"
|
|
75
|
-
BEG_TABLE = "<table>"
|
|
76
|
-
END_TABLE = "</table>"
|
|
77
|
-
BEG_FIGURE = "<figure>"
|
|
78
|
-
END_FIGURE = "</figure>"
|
|
79
|
-
BEG_CAPTION = "<caption>"
|
|
80
|
-
END_CAPTION = "</caption>"
|
|
81
|
-
BEG_EQUATION = "<equation>"
|
|
82
|
-
END_EQUATION = "</equation>"
|
|
83
|
-
BEG_LIST = "<list>"
|
|
84
|
-
END_LIST = "</list>"
|
|
85
|
-
BEG_LISTITEM = "<list-item>"
|
|
86
|
-
END_LISTITEM = "</list-item>"
|
|
87
|
-
|
|
88
|
-
BEG_LOCATION = "<location>"
|
|
89
|
-
END_LOCATION = "</location>"
|
|
90
|
-
BEG_GROUP = "<group>"
|
|
91
|
-
END_GROUP = "</group>"
|
|
47
|
+
DOCUMENT = "doctag"
|
|
48
|
+
OTSL = "otsl"
|
|
49
|
+
ORDERED_LIST = "ordered_list"
|
|
50
|
+
UNORDERED_LIST = "unordered_list"
|
|
51
|
+
LOC = "loc_"
|
|
52
|
+
PAGE_BREAK = "page_break"
|
|
92
53
|
|
|
93
54
|
@classmethod
|
|
94
55
|
def get_special_tokens(
|
|
95
56
|
cls,
|
|
96
|
-
max_rows: int = 100,
|
|
97
|
-
max_cols: int = 100,
|
|
98
|
-
max_pages: int = 1000,
|
|
99
57
|
page_dimension: Tuple[int, int] = (100, 100),
|
|
100
58
|
):
|
|
101
59
|
"""Function to get all special document tokens."""
|
|
102
60
|
special_tokens = [token.value for token in cls]
|
|
103
61
|
|
|
104
|
-
# Adding dynamically generated row and col tokens
|
|
105
|
-
for i in range(0, max_rows + 1):
|
|
106
|
-
special_tokens += [f"<row_{i}>", f"</row_{i}>"]
|
|
107
|
-
|
|
108
|
-
for i in range(0, max_cols + 1):
|
|
109
|
-
special_tokens += [f"<col_{i}>", f"</col_{i}>"]
|
|
110
|
-
|
|
111
|
-
for i in range(6):
|
|
112
|
-
special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
|
|
113
|
-
|
|
114
|
-
# FIXME: this is synonym of section header
|
|
115
62
|
for i in range(6):
|
|
116
|
-
special_tokens += [
|
|
63
|
+
special_tokens += [
|
|
64
|
+
f"<section_header_level_{i}>",
|
|
65
|
+
f"</section_header_level_{i}>",
|
|
66
|
+
]
|
|
117
67
|
|
|
118
|
-
#
|
|
119
|
-
for
|
|
120
|
-
special_tokens.append(f"<
|
|
121
|
-
special_tokens.append(f"</page_{i}>")
|
|
68
|
+
# Add dynamically picture classification tokens
|
|
69
|
+
for _, member in PictureClassificationLabel.__members__.items():
|
|
70
|
+
special_tokens.append(f"<{member}>")
|
|
122
71
|
|
|
123
72
|
# Adding dynamically generated location-tokens
|
|
124
73
|
for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
|
|
@@ -132,25 +81,9 @@ class DocumentToken(Enum):
|
|
|
132
81
|
return label in DocumentToken.get_special_tokens()
|
|
133
82
|
|
|
134
83
|
@staticmethod
|
|
135
|
-
def
|
|
136
|
-
"""Function to get
|
|
137
|
-
|
|
138
|
-
return f"<row_{row}>"
|
|
139
|
-
else:
|
|
140
|
-
return f"</row_{row}>"
|
|
141
|
-
|
|
142
|
-
@staticmethod
|
|
143
|
-
def get_col_token(col: int, beg=bool) -> str:
|
|
144
|
-
"""Function to get page tokens."""
|
|
145
|
-
if beg:
|
|
146
|
-
return f"<col_{col}>"
|
|
147
|
-
else:
|
|
148
|
-
return f"</col_{col}>"
|
|
149
|
-
|
|
150
|
-
@staticmethod
|
|
151
|
-
def get_page_token(page: int):
|
|
152
|
-
"""Function to get page tokens."""
|
|
153
|
-
return f"<page_{page}>"
|
|
84
|
+
def get_picture_classification_token(classification: str) -> str:
|
|
85
|
+
"""Function to get picture classification tokens."""
|
|
86
|
+
return f"<{classification}>"
|
|
154
87
|
|
|
155
88
|
@staticmethod
|
|
156
89
|
def get_location_token(val: float, rnorm: int = 100):
|
|
@@ -172,7 +105,6 @@ class DocumentToken(Enum):
|
|
|
172
105
|
page_h: float,
|
|
173
106
|
xsize: int = 100,
|
|
174
107
|
ysize: int = 100,
|
|
175
|
-
page_i: int = -1,
|
|
176
108
|
):
|
|
177
109
|
"""Get the location string give bbox and page-dim."""
|
|
178
110
|
assert bbox[0] <= bbox[2], f"bbox[0]<=bbox[2] => {bbox[0]}<={bbox[2]}"
|
|
@@ -183,17 +115,11 @@ class DocumentToken(Enum):
|
|
|
183
115
|
x1 = bbox[2] / page_w
|
|
184
116
|
y1 = bbox[3] / page_h
|
|
185
117
|
|
|
186
|
-
page_tok = ""
|
|
187
|
-
if page_i != -1:
|
|
188
|
-
page_tok = DocumentToken.get_page_token(page=page_i)
|
|
189
|
-
|
|
190
118
|
x0_tok = DocumentToken.get_location_token(val=min(x0, x1), rnorm=xsize)
|
|
191
119
|
y0_tok = DocumentToken.get_location_token(val=min(y0, y1), rnorm=ysize)
|
|
192
120
|
x1_tok = DocumentToken.get_location_token(val=max(x0, x1), rnorm=xsize)
|
|
193
121
|
y1_tok = DocumentToken.get_location_token(val=max(y0, y1), rnorm=ysize)
|
|
194
122
|
|
|
195
|
-
loc_str = f"{
|
|
196
|
-
loc_str += f"{page_tok}{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
|
|
197
|
-
loc_str += f"{DocumentToken.END_LOCATION.value}"
|
|
123
|
+
loc_str = f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
|
|
198
124
|
|
|
199
125
|
return loc_str
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.19.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Home-page: https://ds4sd.github.io/
|
|
6
6
|
License: MIT
|
|
@@ -30,7 +30,7 @@ Requires-Dist: jsonref (>=1.1.0,<2.0.0)
|
|
|
30
30
|
Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
|
|
31
31
|
Requires-Dist: latex2mathml (>=3.77.0,<4.0.0)
|
|
32
32
|
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
|
33
|
-
Requires-Dist: pillow (>=10.
|
|
33
|
+
Requires-Dist: pillow (>=10.0.0,<12.0.0)
|
|
34
34
|
Requires-Dist: pydantic (>=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2)
|
|
35
35
|
Requires-Dist: pyyaml (>=5.1,<7.0.0)
|
|
36
36
|
Requires-Dist: semchunk (>=2.2.0,<3.0.0) ; extra == "chunking"
|
|
@@ -18,15 +18,15 @@ docling_core/search/package.py,sha256=Lz2ml2eDy5t0ZimnGTq-DXHAn-f18w0bn4H5xrhs75
|
|
|
18
18
|
docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9ACDd57ds,106
|
|
19
19
|
docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
|
|
20
20
|
docling_core/transforms/chunker/base.py,sha256=BSWTiFOsF5YaZaZJZY8nwIdOXb9uufJMRIds7LxRNh8,2546
|
|
21
|
-
docling_core/transforms/chunker/hierarchical_chunker.py,sha256=
|
|
21
|
+
docling_core/transforms/chunker/hierarchical_chunker.py,sha256=MStDUDtzFGc6j8v9AkcAnnSHTDxdoiVrp8FTmRdGqU8,8138
|
|
22
22
|
docling_core/transforms/chunker/hybrid_chunker.py,sha256=kokjDdxjc_gygOokQwYFVnHv2NjWTgf9uex8o0ole7w,9876
|
|
23
23
|
docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
|
|
24
24
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
25
25
|
docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
|
|
26
26
|
docling_core/types/doc/base.py,sha256=lMRNq1DUK7K26L2VNZRqFaItCSZ6m9BdYTVaJA98PZQ,11495
|
|
27
|
-
docling_core/types/doc/document.py,sha256=
|
|
28
|
-
docling_core/types/doc/labels.py,sha256=
|
|
29
|
-
docling_core/types/doc/tokens.py,sha256=
|
|
27
|
+
docling_core/types/doc/document.py,sha256=GGwtTZspuv2Nd9d9kX1qkcHjb0Soxp2WE1l_RkZ2pNw,103687
|
|
28
|
+
docling_core/types/doc/labels.py,sha256=cqH4DGN9lgZns6gOtL5urzZzUPGOjHJ75xQbIKSh_h8,5306
|
|
29
|
+
docling_core/types/doc/tokens.py,sha256=i73PXkmqXCLsQ5SddnJX8L9e_Ub2_K_DYSE-VE8NDq0,3925
|
|
30
30
|
docling_core/types/doc/utils.py,sha256=SaiQD-WMMooFm1bMqwatU-IGhtG048iKJb-ppnJit_k,2250
|
|
31
31
|
docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
|
|
32
32
|
docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
|
|
@@ -56,8 +56,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
|
|
|
56
56
|
docling_core/utils/legacy.py,sha256=SqNQAxl97aHfoJEsC9vZcMJg5FNkmqKPFi-wdSrnfI0,24442
|
|
57
57
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
58
58
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
59
|
-
docling_core-2.
|
|
60
|
-
docling_core-2.
|
|
61
|
-
docling_core-2.
|
|
62
|
-
docling_core-2.
|
|
63
|
-
docling_core-2.
|
|
59
|
+
docling_core-2.19.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
60
|
+
docling_core-2.19.0.dist-info/METADATA,sha256=inIV9-4qlmmQUldiQyEfm2kUUDCz6vZgKXXpVOzy73s,5803
|
|
61
|
+
docling_core-2.19.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
62
|
+
docling_core-2.19.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
|
|
63
|
+
docling_core-2.19.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|