docling-core 2.23.3__py3-none-any.whl → 2.24.0__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -50,7 +50,7 @@ from docling_core.types.doc.labels import (
50
50
  GraphLinkLabel,
51
51
  GroupLabel,
52
52
  )
53
- from docling_core.types.doc.tokens import DocumentToken, TableToken
53
+ from docling_core.types.doc.tokens import _LOC_PREFIX, DocumentToken, TableToken
54
54
  from docling_core.types.doc.utils import (
55
55
  get_html_tag_with_text_direction,
56
56
  get_text_direction,
@@ -79,6 +79,7 @@ DEFAULT_EXPORT_LABELS = {
79
79
  DocItemLabel.REFERENCE,
80
80
  DocItemLabel.PAGE_HEADER,
81
81
  DocItemLabel.PAGE_FOOTER,
82
+ DocItemLabel.KEY_VALUE_REGION,
82
83
  }
83
84
 
84
85
  DOCUMENT_TOKENS_EXPORT_LABELS = DEFAULT_EXPORT_LABELS.copy()
@@ -414,6 +415,7 @@ class DocumentOrigin(BaseModel):
414
415
  "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
415
416
  "text/asciidoc",
416
417
  "text/markdown",
418
+ "text/csv",
417
419
  ]
418
420
 
419
421
  @field_validator("binary_hash", mode="before")
@@ -643,7 +645,7 @@ class DocItem(
643
645
  def get_location_tokens(
644
646
  self,
645
647
  doc: "DoclingDocument",
646
- new_line: str,
648
+ new_line: str = "", # deprecated
647
649
  xsize: int = 500,
648
650
  ysize: int = 500,
649
651
  ) -> str:
@@ -662,7 +664,7 @@ class DocItem(
662
664
  xsize=xsize,
663
665
  ysize=ysize,
664
666
  )
665
- location += f"{loc_str}{new_line}"
667
+ location += loc_str
666
668
 
667
669
  return location
668
670
 
@@ -722,10 +724,15 @@ class TextItem(DocItem):
722
724
  formatting: Optional[Formatting] = None
723
725
  hyperlink: Optional[Union[AnyUrl, Path]] = None
724
726
 
725
- def export_to_document_tokens(
727
+ @deprecated("Use export_to_doctags() instead.")
728
+ def export_to_document_tokens(self, *args, **kwargs):
729
+ r"""Export to DocTags format."""
730
+ return self.export_to_doctags(*args, **kwargs)
731
+
732
+ def export_to_doctags(
726
733
  self,
727
734
  doc: "DoclingDocument",
728
- new_line: str = "",
735
+ new_line: str = "", # deprecated
729
736
  xsize: int = 500,
730
737
  ysize: int = 500,
731
738
  add_location: bool = True,
@@ -734,29 +741,29 @@ class TextItem(DocItem):
734
741
  r"""Export text element to document tokens format.
735
742
 
736
743
  :param doc: "DoclingDocument":
737
- :param new_line: str (Default value = "")
744
+ :param new_line: str (Default value = "") Deprecated
738
745
  :param xsize: int: (Default value = 500)
739
746
  :param ysize: int: (Default value = 500)
740
747
  :param add_location: bool: (Default value = True)
741
748
  :param add_content: bool: (Default value = True)
742
749
 
743
750
  """
744
- body = f"<{self.label.value}>{new_line}"
751
+ from docling_core.experimental.serializer.doctags import (
752
+ DocTagsDocSerializer,
753
+ DocTagsParams,
754
+ )
745
755
 
746
- if add_location:
747
- body += self.get_location_tokens(
748
- doc=doc,
749
- new_line=new_line,
756
+ serializer = DocTagsDocSerializer(
757
+ doc=doc,
758
+ params=DocTagsParams(
750
759
  xsize=xsize,
751
760
  ysize=ysize,
752
- )
753
-
754
- if add_content and self.text is not None:
755
- body += f"{self.text.strip()}{new_line}"
756
-
757
- body += f"</{self.label.value}>\n"
758
-
759
- return body
761
+ add_location=add_location,
762
+ add_content=add_content,
763
+ ),
764
+ )
765
+ text = serializer.serialize(item=self).text
766
+ return text
760
767
 
761
768
 
762
769
  class TitleItem(TextItem):
@@ -775,10 +782,15 @@ class SectionHeaderItem(TextItem):
775
782
  )
776
783
  level: LevelNumber = 1
777
784
 
778
- def export_to_document_tokens(
785
+ @deprecated("Use export_to_doctags() instead.")
786
+ def export_to_document_tokens(self, *args, **kwargs):
787
+ r"""Export to DocTags format."""
788
+ return self.export_to_doctags(*args, **kwargs)
789
+
790
+ def export_to_doctags(
779
791
  self,
780
792
  doc: "DoclingDocument",
781
- new_line: str = "",
793
+ new_line: str = "", # deprecated
782
794
  xsize: int = 500,
783
795
  ysize: int = 500,
784
796
  add_location: bool = True,
@@ -787,34 +799,29 @@ class SectionHeaderItem(TextItem):
787
799
  r"""Export text element to document tokens format.
788
800
 
789
801
  :param doc: "DoclingDocument":
790
- :param new_line: str (Default value = "")
802
+ :param new_line: str (Default value = "") Deprecated
791
803
  :param xsize: int: (Default value = 500)
792
804
  :param ysize: int: (Default value = 500)
793
805
  :param add_location: bool: (Default value = True)
794
806
  :param add_content: bool: (Default value = True)
795
807
 
796
808
  """
797
- body = f"<{self.label.value}_level_{self.level}>{new_line}"
798
-
799
- # TODO: This must be done through an explicit mapping.
800
- # assert DocumentToken.is_known_token(
801
- # body
802
- # ), f"failed DocumentToken.is_known_token({body})"
809
+ from docling_core.experimental.serializer.doctags import (
810
+ DocTagsDocSerializer,
811
+ DocTagsParams,
812
+ )
803
813
 
804
- if add_location:
805
- body += self.get_location_tokens(
806
- doc=doc,
807
- new_line=new_line,
814
+ serializer = DocTagsDocSerializer(
815
+ doc=doc,
816
+ params=DocTagsParams(
808
817
  xsize=xsize,
809
818
  ysize=ysize,
810
- )
811
-
812
- if add_content and self.text is not None:
813
- body += f"{self.text.strip()}{new_line}"
814
-
815
- body += f"</{self.label.value}_level_{self.level}>\n"
816
-
817
- return body
819
+ add_location=add_location,
820
+ add_content=add_content,
821
+ ),
822
+ )
823
+ text = serializer.serialize(item=self).text
824
+ return text
818
825
 
819
826
 
820
827
  class ListItem(TextItem):
@@ -865,10 +872,15 @@ class CodeItem(FloatingItem, TextItem):
865
872
  )
866
873
  code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN
867
874
 
868
- def export_to_document_tokens(
875
+ @deprecated("Use export_to_doctags() instead.")
876
+ def export_to_document_tokens(self, *args, **kwargs):
877
+ r"""Export to DocTags format."""
878
+ return self.export_to_doctags(*args, **kwargs)
879
+
880
+ def export_to_doctags(
869
881
  self,
870
882
  doc: "DoclingDocument",
871
- new_line: str = "",
883
+ new_line: str = "", # deprecated
872
884
  xsize: int = 500,
873
885
  ysize: int = 500,
874
886
  add_location: bool = True,
@@ -877,29 +889,29 @@ class CodeItem(FloatingItem, TextItem):
877
889
  r"""Export text element to document tokens format.
878
890
 
879
891
  :param doc: "DoclingDocument":
880
- :param new_line: str (Default value = "")
892
+ :param new_line: str (Default value = "") Deprecated
881
893
  :param xsize: int: (Default value = 500)
882
894
  :param ysize: int: (Default value = 500)
883
895
  :param add_location: bool: (Default value = True)
884
896
  :param add_content: bool: (Default value = True)
885
897
 
886
898
  """
887
- body = f"<{self.label.value}>{new_line}"
899
+ from docling_core.experimental.serializer.doctags import (
900
+ DocTagsDocSerializer,
901
+ DocTagsParams,
902
+ )
888
903
 
889
- if add_location:
890
- body += self.get_location_tokens(
891
- doc=doc,
892
- new_line=new_line,
904
+ serializer = DocTagsDocSerializer(
905
+ doc=doc,
906
+ params=DocTagsParams(
893
907
  xsize=xsize,
894
908
  ysize=ysize,
895
- )
896
-
897
- if add_content and self.text is not None:
898
- body += f"<_{self.code_language.value}_>{self.text}{new_line}"
899
-
900
- body += f"</{self.label.value}>\n"
901
-
902
- return body
909
+ add_location=add_location,
910
+ add_content=add_content,
911
+ ),
912
+ )
913
+ text = serializer.serialize(item=self).text
914
+ return text
903
915
 
904
916
 
905
917
  class FormulaItem(TextItem):
@@ -953,7 +965,10 @@ class PictureItem(FloatingItem):
953
965
  image_placeholder: str = "<!-- image -->",
954
966
  ) -> str:
955
967
  """Export picture to Markdown format."""
956
- from docling_core.experimental.serializer.markdown import MarkdownDocSerializer
968
+ from docling_core.experimental.serializer.markdown import (
969
+ MarkdownDocSerializer,
970
+ MarkdownParams,
971
+ )
957
972
 
958
973
  if not add_caption:
959
974
  _logger.warning(
@@ -961,20 +976,13 @@ class PictureItem(FloatingItem):
961
976
  )
962
977
 
963
978
  serializer = MarkdownDocSerializer(
964
- doc=self,
965
- image_mode=image_mode,
966
- )
967
- text = (
968
- serializer.picture_serializer.serialize(
969
- item=self,
970
- doc_serializer=serializer,
971
- doc=doc,
979
+ doc=doc,
980
+ params=MarkdownParams(
972
981
  image_mode=image_mode,
973
982
  image_placeholder=image_placeholder,
974
- ).text
975
- if serializer.picture_serializer
976
- else ""
983
+ ),
977
984
  )
985
+ text = serializer.serialize(item=self).text
978
986
  return text
979
987
 
980
988
  def export_to_html(
@@ -1033,10 +1041,15 @@ class PictureItem(FloatingItem):
1033
1041
  else:
1034
1042
  return default_response
1035
1043
 
1036
- def export_to_document_tokens(
1044
+ @deprecated("Use export_to_doctags() instead.")
1045
+ def export_to_document_tokens(self, *args, **kwargs):
1046
+ r"""Export to DocTags format."""
1047
+ return self.export_to_doctags(*args, **kwargs)
1048
+
1049
+ def export_to_doctags(
1037
1050
  self,
1038
1051
  doc: "DoclingDocument",
1039
- new_line: str = "",
1052
+ new_line: str = "", # deprecated
1040
1053
  xsize: int = 500,
1041
1054
  ysize: int = 500,
1042
1055
  add_location: bool = True,
@@ -1046,7 +1059,7 @@ class PictureItem(FloatingItem):
1046
1059
  r"""Export picture to document tokens format.
1047
1060
 
1048
1061
  :param doc: "DoclingDocument":
1049
- :param new_line: str (Default value = "")
1062
+ :param new_line: str (Default value = "") Deprecated
1050
1063
  :param xsize: int: (Default value = 500)
1051
1064
  :param ysize: int: (Default value = 500)
1052
1065
  :param add_location: bool: (Default value = True)
@@ -1055,59 +1068,23 @@ class PictureItem(FloatingItem):
1055
1068
  :param # not used at the moment
1056
1069
 
1057
1070
  """
1058
- body = f"<{self.label.value}>{new_line}"
1059
- if add_location:
1060
- body += self.get_location_tokens(
1061
- doc=doc,
1062
- new_line=new_line,
1071
+ from docling_core.experimental.serializer.doctags import (
1072
+ DocTagsDocSerializer,
1073
+ DocTagsParams,
1074
+ )
1075
+
1076
+ serializer = DocTagsDocSerializer(
1077
+ doc=doc,
1078
+ params=DocTagsParams(
1063
1079
  xsize=xsize,
1064
1080
  ysize=ysize,
1065
- )
1066
-
1067
- classifications = [
1068
- ann
1069
- for ann in self.annotations
1070
- if isinstance(ann, PictureClassificationData)
1071
- ]
1072
- if len(classifications) > 0:
1073
- # ! TODO: currently this code assumes class_name is of type 'str'
1074
- # ! TODO: when it will change to an ENUM --> adapt code
1075
- predicted_class = classifications[0].predicted_classes[0].class_name
1076
- body += DocumentToken.get_picture_classification_token(predicted_class)
1077
-
1078
- smiles_annotations = [
1079
- ann for ann in self.annotations if isinstance(ann, PictureMoleculeData)
1080
- ]
1081
- if len(smiles_annotations) > 0:
1082
- body += (
1083
- "<"
1084
- + DocumentToken.SMILES.value
1085
- + ">"
1086
- + smiles_annotations[0].smi
1087
- + "</"
1088
- + DocumentToken.SMILES.value
1089
- + ">"
1090
- )
1091
-
1092
- if add_caption and len(self.captions):
1093
- text = self.caption_text(doc)
1094
-
1095
- if len(text):
1096
- body += f"<{DocItemLabel.CAPTION.value}>"
1097
- for caption in self.captions:
1098
- body += caption.resolve(doc).get_location_tokens(
1099
- doc=doc,
1100
- new_line=new_line,
1101
- xsize=xsize,
1102
- ysize=ysize,
1103
- )
1104
- body += f"{text.strip()}"
1105
- body += f"</{DocItemLabel.CAPTION.value}>"
1106
- body += f"{new_line}"
1107
-
1108
- body += f"</{self.label.value}>\n"
1109
-
1110
- return body
1081
+ add_location=add_location,
1082
+ add_content=add_content,
1083
+ add_caption=add_caption,
1084
+ ),
1085
+ )
1086
+ text = serializer.serialize(item=self).text
1087
+ return text
1111
1088
 
1112
1089
 
1113
1090
  class TableItem(FloatingItem):
@@ -1171,18 +1148,8 @@ class TableItem(FloatingItem):
1171
1148
  MarkdownDocSerializer,
1172
1149
  )
1173
1150
 
1174
- serializer = MarkdownDocSerializer(
1175
- doc=doc,
1176
- )
1177
- text = (
1178
- serializer.table_serializer.serialize(
1179
- item=self,
1180
- doc_serializer=serializer,
1181
- doc=doc,
1182
- ).text
1183
- if serializer.table_serializer
1184
- else ""
1185
- )
1151
+ serializer = MarkdownDocSerializer(doc=doc)
1152
+ text = serializer.serialize(item=self).text
1186
1153
  return text
1187
1154
  else:
1188
1155
  _logger.warning(
@@ -1391,10 +1358,15 @@ class TableItem(FloatingItem):
1391
1358
  body_str = "".join(body)
1392
1359
  return body_str
1393
1360
 
1394
- def export_to_document_tokens(
1361
+ @deprecated("Use export_to_doctags() instead.")
1362
+ def export_to_document_tokens(self, *args, **kwargs):
1363
+ r"""Export to DocTags format."""
1364
+ return self.export_to_doctags(*args, **kwargs)
1365
+
1366
+ def export_to_doctags(
1395
1367
  self,
1396
1368
  doc: "DoclingDocument",
1397
- new_line: str = "",
1369
+ new_line: str = "", # deprecated
1398
1370
  xsize: int = 500,
1399
1371
  ysize: int = 500,
1400
1372
  add_location: bool = True,
@@ -1405,7 +1377,7 @@ class TableItem(FloatingItem):
1405
1377
  r"""Export table to document tokens format.
1406
1378
 
1407
1379
  :param doc: "DoclingDocument":
1408
- :param new_line: str (Default value = "")
1380
+ :param new_line: str (Default value = "") Deprecated
1409
1381
  :param xsize: int: (Default value = 500)
1410
1382
  :param ysize: int: (Default value = 500)
1411
1383
  :param add_location: bool: (Default value = True)
@@ -1414,39 +1386,24 @@ class TableItem(FloatingItem):
1414
1386
  :param add_caption: bool: (Default value = True)
1415
1387
 
1416
1388
  """
1417
- otsl_tag = DocumentToken.OTSL.value
1418
-
1419
- body = f"<{otsl_tag}>{new_line}"
1389
+ from docling_core.experimental.serializer.doctags import (
1390
+ DocTagsDocSerializer,
1391
+ DocTagsParams,
1392
+ )
1420
1393
 
1421
- if add_location:
1422
- body += self.get_location_tokens(
1423
- doc=doc,
1424
- new_line=new_line,
1394
+ serializer = DocTagsDocSerializer(
1395
+ doc=doc,
1396
+ params=DocTagsParams(
1425
1397
  xsize=xsize,
1426
1398
  ysize=ysize,
1427
- )
1428
-
1429
- body += self.export_to_otsl(doc, add_cell_location, add_cell_text, xsize, ysize)
1430
-
1431
- if add_caption and len(self.captions):
1432
- text = self.caption_text(doc)
1433
-
1434
- if len(text):
1435
- body += f"<{DocItemLabel.CAPTION.value}>"
1436
- for caption in self.captions:
1437
- body += caption.resolve(doc).get_location_tokens(
1438
- doc=doc,
1439
- new_line=new_line,
1440
- xsize=xsize,
1441
- ysize=ysize,
1442
- )
1443
- body += f"{text.strip()}"
1444
- body += f"</{DocItemLabel.CAPTION.value}>"
1445
- body += f"{new_line}"
1446
-
1447
- body += f"</{otsl_tag}>\n"
1448
-
1449
- return body
1399
+ add_location=add_location,
1400
+ add_caption=add_caption,
1401
+ add_table_cell_location=add_cell_location,
1402
+ add_table_cell_text=add_cell_text,
1403
+ ),
1404
+ )
1405
+ text = serializer.serialize(item=self).text
1406
+ return text
1450
1407
 
1451
1408
 
1452
1409
  class GraphCell(BaseModel):
@@ -1508,6 +1465,42 @@ class KeyValueItem(FloatingItem):
1508
1465
 
1509
1466
  graph: GraphData
1510
1467
 
1468
+ def export_to_document_tokens(
1469
+ self,
1470
+ doc: "DoclingDocument",
1471
+ new_line: str = "", # deprecated
1472
+ xsize: int = 500,
1473
+ ysize: int = 500,
1474
+ add_location: bool = True,
1475
+ add_content: bool = True,
1476
+ ):
1477
+ r"""Export key value item to document tokens format.
1478
+
1479
+ :param doc: "DoclingDocument":
1480
+ :param new_line: str (Default value = "") Deprecated
1481
+ :param xsize: int: (Default value = 500)
1482
+ :param ysize: int: (Default value = 500)
1483
+ :param add_location: bool: (Default value = True)
1484
+ :param add_content: bool: (Default value = True)
1485
+
1486
+ """
1487
+ from docling_core.experimental.serializer.doctags import (
1488
+ DocTagsDocSerializer,
1489
+ DocTagsParams,
1490
+ )
1491
+
1492
+ serializer = DocTagsDocSerializer(
1493
+ doc=doc,
1494
+ params=DocTagsParams(
1495
+ xsize=xsize,
1496
+ ysize=ysize,
1497
+ add_location=add_location,
1498
+ add_content=add_content,
1499
+ ),
1500
+ )
1501
+ text = serializer.serialize(item=self).text
1502
+ return text
1503
+
1511
1504
 
1512
1505
  class FormItem(FloatingItem):
1513
1506
  """FormItem."""
@@ -2297,7 +2290,7 @@ class DoclingDocument(BaseModel):
2297
2290
  with_groups: bool = False,
2298
2291
  traverse_pictures: bool = False,
2299
2292
  page_no: Optional[int] = None,
2300
- included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
2293
+ included_content_layers: Optional[set[ContentLayer]] = None,
2301
2294
  _level: int = 0, # fixed parameter, carries through the node nesting level
2302
2295
  ) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
2303
2296
  """iterate_elements.
@@ -2310,6 +2303,11 @@ class DoclingDocument(BaseModel):
2310
2303
  :param # fixed parameter:
2311
2304
  :param carries through the node nesting level:
2312
2305
  """
2306
+ my_layers = (
2307
+ included_content_layers
2308
+ if included_content_layers is not None
2309
+ else DEFAULT_CONTENT_LAYERS
2310
+ )
2313
2311
  if not root:
2314
2312
  root = self.body
2315
2313
 
@@ -2325,7 +2323,7 @@ class DoclingDocument(BaseModel):
2325
2323
  or any(prov.page_no == page_no for prov in root.prov)
2326
2324
  )
2327
2325
  )
2328
- and root.content_layer in included_content_layers
2326
+ and root.content_layer in my_layers
2329
2327
  )
2330
2328
 
2331
2329
  if should_yield:
@@ -2345,7 +2343,7 @@ class DoclingDocument(BaseModel):
2345
2343
  traverse_pictures=traverse_pictures,
2346
2344
  page_no=page_no,
2347
2345
  _level=_level + 1,
2348
- included_content_layers=included_content_layers,
2346
+ included_content_layers=my_layers,
2349
2347
  )
2350
2348
 
2351
2349
  def _clear_picture_pil_cache(self):
@@ -2475,12 +2473,14 @@ class DoclingDocument(BaseModel):
2475
2473
 
2476
2474
  def save_as_json(
2477
2475
  self,
2478
- filename: Path,
2476
+ filename: Union[str, Path],
2479
2477
  artifacts_dir: Optional[Path] = None,
2480
2478
  image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
2481
2479
  indent: int = 2,
2482
2480
  ):
2483
2481
  """Save as json."""
2482
+ if isinstance(filename, str):
2483
+ filename = Path(filename)
2484
2484
  artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
2485
2485
 
2486
2486
  if image_mode == ImageRefMode.REFERENCED:
@@ -2495,7 +2495,7 @@ class DoclingDocument(BaseModel):
2495
2495
  json.dump(out, fw, indent=indent)
2496
2496
 
2497
2497
  @classmethod
2498
- def load_from_json(cls, filename: Path) -> "DoclingDocument":
2498
+ def load_from_json(cls, filename: Union[str, Path]) -> "DoclingDocument":
2499
2499
  """load_from_json.
2500
2500
 
2501
2501
  :param filename: The filename to load a saved DoclingDocument from a .json.
@@ -2505,17 +2505,21 @@ class DoclingDocument(BaseModel):
2505
2505
  :rtype: DoclingDocument
2506
2506
 
2507
2507
  """
2508
+ if isinstance(filename, str):
2509
+ filename = Path(filename)
2508
2510
  with open(filename, "r", encoding="utf-8") as f:
2509
2511
  return cls.model_validate_json(f.read())
2510
2512
 
2511
2513
  def save_as_yaml(
2512
2514
  self,
2513
- filename: Path,
2515
+ filename: Union[str, Path],
2514
2516
  artifacts_dir: Optional[Path] = None,
2515
2517
  image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
2516
2518
  default_flow_style: bool = False,
2517
2519
  ):
2518
2520
  """Save as yaml."""
2521
+ if isinstance(filename, str):
2522
+ filename = Path(filename)
2519
2523
  artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
2520
2524
 
2521
2525
  if image_mode == ImageRefMode.REFERENCED:
@@ -2530,7 +2534,7 @@ class DoclingDocument(BaseModel):
2530
2534
  yaml.dump(out, fw, default_flow_style=default_flow_style)
2531
2535
 
2532
2536
  @classmethod
2533
- def load_from_yaml(cls, filename: Path) -> "DoclingDocument":
2537
+ def load_from_yaml(cls, filename: Union[str, Path]) -> "DoclingDocument":
2534
2538
  """load_from_yaml.
2535
2539
 
2536
2540
  Args:
@@ -2539,6 +2543,8 @@ class DoclingDocument(BaseModel):
2539
2543
  Returns:
2540
2544
  DoclingDocument: the loaded DoclingDocument
2541
2545
  """
2546
+ if isinstance(filename, str):
2547
+ filename = Path(filename)
2542
2548
  with open(filename, encoding="utf-8") as f:
2543
2549
  data = yaml.load(f, Loader=yaml.FullLoader)
2544
2550
  return DoclingDocument.model_validate(data)
@@ -2556,12 +2562,12 @@ class DoclingDocument(BaseModel):
2556
2562
 
2557
2563
  def save_as_markdown(
2558
2564
  self,
2559
- filename: Path,
2565
+ filename: Union[str, Path],
2560
2566
  artifacts_dir: Optional[Path] = None,
2561
2567
  delim: str = "\n\n",
2562
2568
  from_element: int = 0,
2563
2569
  to_element: int = sys.maxsize,
2564
- labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
2570
+ labels: Optional[set[DocItemLabel]] = None,
2565
2571
  strict_text: bool = False,
2566
2572
  escaping_underscores: bool = True,
2567
2573
  image_placeholder: str = "<!-- image -->",
@@ -2569,9 +2575,12 @@ class DoclingDocument(BaseModel):
2569
2575
  indent: int = 4,
2570
2576
  text_width: int = -1,
2571
2577
  page_no: Optional[int] = None,
2572
- included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
2578
+ included_content_layers: Optional[set[ContentLayer]] = None,
2579
+ page_break_placeholder: Optional[str] = None,
2573
2580
  ):
2574
2581
  """Save to markdown."""
2582
+ if isinstance(filename, str):
2583
+ filename = Path(filename)
2575
2584
  artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
2576
2585
 
2577
2586
  if image_mode == ImageRefMode.REFERENCED:
@@ -2587,13 +2596,14 @@ class DoclingDocument(BaseModel):
2587
2596
  to_element=to_element,
2588
2597
  labels=labels,
2589
2598
  strict_text=strict_text,
2590
- escaping_underscores=escaping_underscores,
2599
+ escape_underscores=escaping_underscores,
2591
2600
  image_placeholder=image_placeholder,
2592
2601
  image_mode=image_mode,
2593
2602
  indent=indent,
2594
2603
  text_width=text_width,
2595
2604
  page_no=page_no,
2596
2605
  included_content_layers=included_content_layers,
2606
+ page_break_placeholder=page_break_placeholder,
2597
2607
  )
2598
2608
 
2599
2609
  with open(filename, "w", encoding="utf-8") as fw:
@@ -2604,15 +2614,16 @@ class DoclingDocument(BaseModel):
2604
2614
  delim: str = "\n\n",
2605
2615
  from_element: int = 0,
2606
2616
  to_element: int = sys.maxsize,
2607
- labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
2617
+ labels: Optional[set[DocItemLabel]] = None,
2608
2618
  strict_text: bool = False,
2609
- escaping_underscores: bool = True,
2619
+ escape_underscores: bool = True,
2610
2620
  image_placeholder: str = "<!-- image -->",
2611
2621
  image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
2612
2622
  indent: int = 4,
2613
2623
  text_width: int = -1,
2614
2624
  page_no: Optional[int] = None,
2615
- included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
2625
+ included_content_layers: Optional[set[ContentLayer]] = None,
2626
+ page_break_placeholder: Optional[str] = None, # e.g. "<!-- page break -->",
2616
2627
  ) -> str:
2617
2628
  r"""Serialize to Markdown.
2618
2629
 
@@ -2627,8 +2638,9 @@ class DoclingDocument(BaseModel):
2627
2638
  :param to_element: Body slicing stop index
2628
2639
  (exclusive). (Default value = maxint).
2629
2640
  :type to_element: int = sys.maxsize
2630
- :param labels: The set of document labels to include in the export.
2631
- :type labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS
2641
+ :param labels: The set of document labels to include in the export. None falls
2642
+ back to the system-defined default.
2643
+ :type labels: Optional[set[DocItemLabel]] = None
2632
2644
  :param strict_text: Deprecated.
2633
2645
  :type strict_text: bool = False
2634
2646
  :param escaping_underscores: bool: Whether to escape underscores in the
@@ -2643,30 +2655,40 @@ class DoclingDocument(BaseModel):
2643
2655
  :param indent: The indent in spaces of the nested lists.
2644
2656
  (Default value = 4).
2645
2657
  :type indent: int = 4
2658
+ :param included_content_layers: The set of layers to include in the export. None
2659
+ falls back to the system-defined default.
2660
+ :type included_content_layers: Optional[set[ContentLayer]] = None
2661
+ :param page_break_placeholder: The placeholder to include for marking page
2662
+ breaks. None means no page break placeholder will be used.
2663
+ :type page_break_placeholder: Optional[str] = None
2646
2664
  :returns: The exported Markdown representation.
2647
2665
  :rtype: str
2648
2666
  """
2649
2667
  from docling_core.experimental.serializer.markdown import (
2650
2668
  MarkdownDocSerializer,
2651
- MarkdownListSerializer,
2652
- MarkdownTextSerializer,
2669
+ MarkdownParams,
2653
2670
  )
2654
2671
 
2672
+ my_labels = labels if labels is not None else DOCUMENT_TOKENS_EXPORT_LABELS
2673
+ my_layers = (
2674
+ included_content_layers
2675
+ if included_content_layers is not None
2676
+ else DEFAULT_CONTENT_LAYERS
2677
+ )
2655
2678
  serializer = MarkdownDocSerializer(
2656
2679
  doc=self,
2657
- start=from_element,
2658
- stop=to_element,
2659
- image_placeholder=image_placeholder,
2660
- image_mode=image_mode,
2661
- labels=labels,
2662
- layers=included_content_layers,
2663
- pages={page_no} if page_no is not None else None,
2664
- escaping_underscores=escaping_underscores,
2665
- text_serializer=MarkdownTextSerializer(
2666
- wrap_width=text_width if text_width > 0 else None,
2667
- ),
2668
- list_serializer=MarkdownListSerializer(
2680
+ params=MarkdownParams(
2681
+ labels=my_labels,
2682
+ layers=my_layers,
2683
+ pages={page_no} if page_no is not None else None,
2684
+ start_idx=from_element,
2685
+ stop_idx=to_element,
2686
+ escape_underscores=escape_underscores,
2687
+ image_placeholder=image_placeholder,
2688
+ image_mode=image_mode,
2669
2689
  indent=indent,
2690
+ wrap_width=text_width if text_width > 0 else None,
2691
+ page_break_placeholder=page_break_placeholder,
2670
2692
  ),
2671
2693
  )
2672
2694
  ser_res = serializer.serialize()
@@ -2687,34 +2709,38 @@ class DoclingDocument(BaseModel):
2687
2709
  delim: str = "\n\n",
2688
2710
  from_element: int = 0,
2689
2711
  to_element: int = 1000000,
2690
- labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
2712
+ labels: Optional[set[DocItemLabel]] = None,
2691
2713
  ) -> str:
2692
2714
  """export_to_text."""
2715
+ my_labels = labels if labels is not None else DOCUMENT_TOKENS_EXPORT_LABELS
2716
+
2693
2717
  return self.export_to_markdown(
2694
- delim,
2695
- from_element,
2696
- to_element,
2697
- labels,
2718
+ delim=delim,
2719
+ from_element=from_element,
2720
+ to_element=to_element,
2721
+ labels=my_labels,
2698
2722
  strict_text=True,
2699
- escaping_underscores=False,
2723
+ escape_underscores=False,
2700
2724
  image_placeholder="",
2701
2725
  )
2702
2726
 
2703
2727
  def save_as_html(
2704
2728
  self,
2705
- filename: Path,
2729
+ filename: Union[str, Path],
2706
2730
  artifacts_dir: Optional[Path] = None,
2707
2731
  from_element: int = 0,
2708
2732
  to_element: int = sys.maxsize,
2709
- labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
2733
+ labels: Optional[set[DocItemLabel]] = None,
2710
2734
  image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
2711
2735
  formula_to_mathml: bool = True,
2712
2736
  page_no: Optional[int] = None,
2713
2737
  html_lang: str = "en",
2714
2738
  html_head: str = _HTML_DEFAULT_HEAD,
2715
- included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
2739
+ included_content_layers: Optional[set[ContentLayer]] = None,
2716
2740
  ):
2717
2741
  """Save to HTML."""
2742
+ if isinstance(filename, str):
2743
+ filename = Path(filename)
2718
2744
  artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
2719
2745
 
2720
2746
  if image_mode == ImageRefMode.REFERENCED:
@@ -2740,8 +2766,10 @@ class DoclingDocument(BaseModel):
2740
2766
  fw.write(html_out)
2741
2767
 
2742
2768
  def _get_output_paths(
2743
- self, filename: Path, artifacts_dir: Optional[Path] = None
2769
+ self, filename: Union[str, Path], artifacts_dir: Optional[Path] = None
2744
2770
  ) -> Tuple[Path, Optional[Path]]:
2771
+ if isinstance(filename, str):
2772
+ filename = Path(filename)
2745
2773
  if artifacts_dir is None:
2746
2774
  # Remove the extension and add '_pictures'
2747
2775
  artifacts_dir = filename.with_suffix("")
@@ -2775,15 +2803,21 @@ class DoclingDocument(BaseModel):
2775
2803
  self,
2776
2804
  from_element: int = 0,
2777
2805
  to_element: int = sys.maxsize,
2778
- labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
2806
+ labels: Optional[set[DocItemLabel]] = None,
2779
2807
  image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
2780
2808
  formula_to_mathml: bool = True,
2781
2809
  page_no: Optional[int] = None,
2782
2810
  html_lang: str = "en",
2783
2811
  html_head: str = _HTML_DEFAULT_HEAD,
2784
- included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
2812
+ included_content_layers: Optional[set[ContentLayer]] = None,
2785
2813
  ) -> str:
2786
2814
  r"""Serialize to HTML."""
2815
+ my_labels = labels if labels is not None else DEFAULT_EXPORT_LABELS
2816
+ my_layers = (
2817
+ included_content_layers
2818
+ if included_content_layers is not None
2819
+ else DEFAULT_CONTENT_LAYERS
2820
+ )
2787
2821
 
2788
2822
  def close_lists(
2789
2823
  curr_level: int,
@@ -2831,7 +2865,7 @@ class DoclingDocument(BaseModel):
2831
2865
  self.body,
2832
2866
  with_groups=True,
2833
2867
  page_no=page_no,
2834
- included_content_layers=included_content_layers,
2868
+ included_content_layers=my_layers,
2835
2869
  )
2836
2870
  ):
2837
2871
  # If we've moved to a lower level, we're exiting one or more groups
@@ -2853,7 +2887,7 @@ class DoclingDocument(BaseModel):
2853
2887
  if ix < from_element or to_element <= ix:
2854
2888
  continue # skip as many items as you want
2855
2889
 
2856
- if (isinstance(item, DocItem)) and (item.label not in labels):
2890
+ if (isinstance(item, DocItem)) and (item.label not in my_labels):
2857
2891
  continue # skip any label that is not whitelisted
2858
2892
 
2859
2893
  if isinstance(item, GroupItem) and item.label in [
@@ -3000,7 +3034,7 @@ class DoclingDocument(BaseModel):
3000
3034
  )
3001
3035
  )
3002
3036
 
3003
- elif isinstance(item, DocItem) and item.label in labels:
3037
+ elif isinstance(item, DocItem) and item.label in my_labels:
3004
3038
  continue
3005
3039
 
3006
3040
  html_texts.append("</html>")
@@ -3037,6 +3071,7 @@ class DoclingDocument(BaseModel):
3037
3071
  "list_item": DocItemLabel.LIST_ITEM,
3038
3072
  "footnote": DocItemLabel.FOOTNOTE,
3039
3073
  "code": DocItemLabel.CODE,
3074
+ "key_value_region": DocItemLabel.KEY_VALUE_REGION,
3040
3075
  }
3041
3076
 
3042
3077
  def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
@@ -3189,7 +3224,7 @@ class DoclingDocument(BaseModel):
3189
3224
  token
3190
3225
  for token in tokens
3191
3226
  if not (
3192
- token.startswith(rf"<{DocumentToken.LOC.value}")
3227
+ token.startswith(rf"<{_LOC_PREFIX}")
3193
3228
  or token
3194
3229
  in [
3195
3230
  rf"<{DocumentToken.OTSL.value}>",
@@ -3203,7 +3238,7 @@ class DoclingDocument(BaseModel):
3203
3238
  token
3204
3239
  for token in text_parts
3205
3240
  if not (
3206
- token.startswith(rf"<{DocumentToken.LOC.value}")
3241
+ token.startswith(rf"<{_LOC_PREFIX}")
3207
3242
  or token
3208
3243
  in [
3209
3244
  rf"<{DocumentToken.OTSL.value}>",
@@ -3228,6 +3263,95 @@ class DoclingDocument(BaseModel):
3228
3263
  table_cells=table_cells,
3229
3264
  )
3230
3265
 
3266
+ def parse_key_value_item(
3267
+ tokens: str, image: Optional[PILImage.Image] = None
3268
+ ) -> Tuple[GraphData, Optional[ProvenanceItem]]:
3269
+ if image is not None:
3270
+ pg_width = image.width
3271
+ pg_height = image.height
3272
+ else:
3273
+ pg_width = 1
3274
+ pg_height = 1
3275
+
3276
+ start_locs_match = re.search(r"<key_value_region>(.*?)<key", tokens)
3277
+ if start_locs_match:
3278
+ overall_locs = start_locs_match.group(1)
3279
+ overall_bbox = extract_bounding_box(overall_locs) if image else None
3280
+ overall_prov = (
3281
+ ProvenanceItem(
3282
+ bbox=overall_bbox.resize_by_scale(pg_width, pg_height),
3283
+ charspan=(0, 0),
3284
+ page_no=1,
3285
+ )
3286
+ if overall_bbox
3287
+ else None
3288
+ )
3289
+ else:
3290
+ overall_prov = None
3291
+
3292
+ # here we assumed the labels as only key or value, later on we can update
3293
+ # it to have unspecified, checkbox etc.
3294
+ cell_pattern = re.compile(
3295
+ r"<(?P<label>key|value)_(?P<id>\d+)>"
3296
+ r"(?P<content>.*?)"
3297
+ r"</(?P=label)_(?P=id)>",
3298
+ re.DOTALL,
3299
+ )
3300
+
3301
+ cells: List["GraphCell"] = []
3302
+ links: List["GraphLink"] = []
3303
+ raw_link_predictions = []
3304
+
3305
+ for cell_match in cell_pattern.finditer(tokens):
3306
+ cell_label_str = cell_match.group("label") # "key" or "value"
3307
+ cell_id = int(cell_match.group("id"))
3308
+ raw_content = cell_match.group("content")
3309
+
3310
+ # link tokens
3311
+ link_matches = re.findall(r"<link_(\d+)>", raw_content)
3312
+
3313
+ cell_bbox = extract_bounding_box(raw_content) if image else None
3314
+ cell_prov = None
3315
+ if cell_bbox is not None:
3316
+ cell_prov = ProvenanceItem(
3317
+ bbox=cell_bbox.resize_by_scale(pg_width, pg_height),
3318
+ charspan=(0, 0),
3319
+ page_no=1,
3320
+ )
3321
+
3322
+ cleaned_text = re.sub(r"<loc_\d+>", "", raw_content)
3323
+ cleaned_text = re.sub(r"<link_\d+>", "", cleaned_text).strip()
3324
+
3325
+ cell_obj = GraphCell(
3326
+ label=GraphCellLabel(cell_label_str),
3327
+ cell_id=cell_id,
3328
+ text=cleaned_text,
3329
+ orig=cleaned_text,
3330
+ prov=cell_prov,
3331
+ item_ref=None,
3332
+ )
3333
+ cells.append(cell_obj)
3334
+
3335
+ cell_ids = {cell.cell_id for cell in cells}
3336
+
3337
+ for target_str in link_matches:
3338
+ raw_link_predictions.append((cell_id, int(target_str)))
3339
+
3340
+ cell_ids = {cell.cell_id for cell in cells}
3341
+
3342
+ for source_id, target_id in raw_link_predictions:
3343
+ # basic check to validate the prediction
3344
+ if target_id not in cell_ids:
3345
+ continue
3346
+ link_obj = GraphLink(
3347
+ label=GraphLinkLabel.TO_VALUE,
3348
+ source_cell_id=source_id,
3349
+ target_cell_id=target_id,
3350
+ )
3351
+ links.append(link_obj)
3352
+
3353
+ return (GraphData(cells=cells, links=links), overall_prov)
3354
+
3231
3355
  # doc = DoclingDocument(name="Document")
3232
3356
  for pg_idx, doctag_page in enumerate(doctag_document.pages):
3233
3357
  page_doctags = doctag_page.tokens
@@ -3243,6 +3367,12 @@ class DoclingDocument(BaseModel):
3243
3367
  pg_width = 1
3244
3368
  pg_height = 1
3245
3369
 
3370
+ self.add_page(
3371
+ page_no=page_no,
3372
+ size=Size(width=pg_width, height=pg_height),
3373
+ image=ImageRef.from_pil(image=image, dpi=72) if image else None,
3374
+ )
3375
+
3246
3376
  """
3247
3377
  1. Finds all <tag>...</tag>
3248
3378
  blocks in the entire string (multi-line friendly)
@@ -3263,6 +3393,7 @@ class DoclingDocument(BaseModel):
3263
3393
  rf"{DocItemLabel.SECTION_HEADER}_level_1|"
3264
3394
  rf"{DocumentToken.ORDERED_LIST.value}|"
3265
3395
  rf"{DocumentToken.UNORDERED_LIST.value}|"
3396
+ rf"{DocItemLabel.KEY_VALUE_REGION}|"
3266
3397
  rf"{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
3267
3398
  )
3268
3399
 
@@ -3348,6 +3479,11 @@ class DoclingDocument(BaseModel):
3348
3479
  parent=None,
3349
3480
  )
3350
3481
  pic.captions.append(caption_item.get_ref())
3482
+ elif tag_name == DocItemLabel.KEY_VALUE_REGION:
3483
+ key_value_data, kv_item_prov = parse_key_value_item(
3484
+ full_chunk, image
3485
+ )
3486
+ self.add_key_values(graph=key_value_data, prov=kv_item_prov)
3351
3487
  elif tag_name in [
3352
3488
  DocumentToken.ORDERED_LIST.value,
3353
3489
  DocumentToken.UNORDERED_LIST.value,
@@ -3392,18 +3528,25 @@ class DoclingDocument(BaseModel):
3392
3528
  else:
3393
3529
  # For everything else, treat as text
3394
3530
  text_content = extract_inner_text(full_chunk)
3531
+ element_prov = (
3532
+ ProvenanceItem(
3533
+ bbox=bbox.resize_by_scale(pg_width, pg_height),
3534
+ charspan=(0, len(text_content)),
3535
+ page_no=page_no,
3536
+ )
3537
+ if bbox
3538
+ else None
3539
+ )
3540
+
3541
+ content_layer = ContentLayer.BODY
3542
+ if tag_name in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
3543
+ content_layer = ContentLayer.FURNITURE
3544
+
3395
3545
  self.add_text(
3396
3546
  label=doc_label,
3397
3547
  text=text_content,
3398
- prov=(
3399
- ProvenanceItem(
3400
- bbox=bbox.resize_by_scale(pg_width, pg_height),
3401
- charspan=(0, len(text_content)),
3402
- page_no=page_no,
3403
- )
3404
- if bbox
3405
- else None
3406
- ),
3548
+ prov=element_prov,
3549
+ content_layer=content_layer,
3407
3550
  )
3408
3551
  return self
3409
3552
 
@@ -3414,11 +3557,11 @@ class DoclingDocument(BaseModel):
3414
3557
 
3415
3558
  def save_as_doctags(
3416
3559
  self,
3417
- filename: Path,
3560
+ filename: Union[str, Path],
3418
3561
  delim: str = "",
3419
3562
  from_element: int = 0,
3420
3563
  to_element: int = sys.maxsize,
3421
- labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
3564
+ labels: Optional[set[DocItemLabel]] = None,
3422
3565
  xsize: int = 500,
3423
3566
  ysize: int = 500,
3424
3567
  add_location: bool = True,
@@ -3427,9 +3570,12 @@ class DoclingDocument(BaseModel):
3427
3570
  # table specific flags
3428
3571
  add_table_cell_location: bool = False,
3429
3572
  add_table_cell_text: bool = True,
3573
+ minified: bool = False,
3430
3574
  ):
3431
3575
  r"""Save the document content to DocTags format."""
3432
- out = self.export_to_document_tokens(
3576
+ if isinstance(filename, str):
3577
+ filename = Path(filename)
3578
+ out = self.export_to_doctags(
3433
3579
  delim=delim,
3434
3580
  from_element=from_element,
3435
3581
  to_element=to_element,
@@ -3442,17 +3588,23 @@ class DoclingDocument(BaseModel):
3442
3588
  # table specific flags
3443
3589
  add_table_cell_location=add_table_cell_location,
3444
3590
  add_table_cell_text=add_table_cell_text,
3591
+ minified=minified,
3445
3592
  )
3446
3593
 
3447
3594
  with open(filename, "w", encoding="utf-8") as fw:
3448
3595
  fw.write(out)
3449
3596
 
3450
- def export_to_document_tokens( # noqa: C901
3597
+ @deprecated("Use export_to_doctags() instead.")
3598
+ def export_to_document_tokens(self, *args, **kwargs):
3599
+ r"""Export to DocTags format."""
3600
+ return self.export_to_doctags(*args, **kwargs)
3601
+
3602
+ def export_to_doctags( # noqa: C901
3451
3603
  self,
3452
- delim: str = "",
3604
+ delim: str = "", # deprecated
3453
3605
  from_element: int = 0,
3454
3606
  to_element: int = sys.maxsize,
3455
- labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
3607
+ labels: Optional[set[DocItemLabel]] = None,
3456
3608
  xsize: int = 500,
3457
3609
  ysize: int = 500,
3458
3610
  add_location: bool = True,
@@ -3461,13 +3613,14 @@ class DoclingDocument(BaseModel):
3461
3613
  # table specific flags
3462
3614
  add_table_cell_location: bool = False,
3463
3615
  add_table_cell_text: bool = True,
3616
+ minified: bool = False,
3464
3617
  ) -> str:
3465
3618
  r"""Exports the document content to a DocumentToken format.
3466
3619
 
3467
3620
  Operates on a slice of the document's body as defined through arguments
3468
3621
  from_element and to_element; defaulting to the whole main_text.
3469
3622
 
3470
- :param delim: str: (Default value = "")
3623
+ :param delim: str: (Default value = "") Deprecated
3471
3624
  :param from_element: int: (Default value = 0)
3472
3625
  :param to_element: Optional[int]: (Default value = None)
3473
3626
  :param labels: set[DocItemLabel]
@@ -3478,199 +3631,40 @@ class DoclingDocument(BaseModel):
3478
3631
  :param add_page_index: bool: (Default value = True)
3479
3632
  :param # table specific flagsadd_table_cell_location: bool
3480
3633
  :param add_table_cell_text: bool: (Default value = True)
3634
+ :param minified: bool: (Default value = False)
3481
3635
  :returns: The content of the document formatted as a DocTags string.
3482
3636
  :rtype: str
3483
3637
  """
3484
-
3485
- def _close_lists(
3486
- current_level: int,
3487
- previous_level: int,
3488
- ordered_list_stack: List[bool],
3489
- output_parts: List[str],
3490
- ) -> List[bool]:
3491
- """Close open list tags until the nesting level matches item's level."""
3492
- while current_level < previous_level and ordered_list_stack:
3493
- last_is_ordered = ordered_list_stack.pop()
3494
- if last_is_ordered:
3495
- output_parts.append(f"</{DocumentToken.ORDERED_LIST.value}>\n")
3496
- else:
3497
- output_parts.append(f"</{DocumentToken.UNORDERED_LIST.value}>\n")
3498
- previous_level -= 1
3499
- return ordered_list_stack
3500
-
3501
- def _add_page_break_if_needed(
3502
- output_parts: List[str],
3503
- item,
3504
- prev_page_no,
3505
- page_break_enabled: bool,
3506
- ):
3507
- """Inserts a page-break token.
3508
-
3509
- Inserts a page-break token if the item's page number is different
3510
- from the previous item and page breaks are enabled.
3511
- Returns the updated output_parts list and the current page number.
3512
- """
3513
- if not page_break_enabled:
3514
- return output_parts, prev_page_no
3515
-
3516
- if not item.prov:
3517
- return output_parts, prev_page_no
3518
-
3519
- current_page_no = item.prov[0].page_no
3520
- if prev_page_no is None:
3521
- return output_parts, current_page_no
3522
-
3523
- if current_page_no != prev_page_no:
3524
- output_parts.append(f"<{DocumentToken.PAGE_BREAK.value}>\n")
3525
-
3526
- return output_parts, current_page_no
3527
-
3528
- def _get_standalone_captions(document_body):
3529
- """Identify captions that are not attached to any table or figure."""
3530
- all_captions = set()
3531
- matched_captions = set()
3532
- for item, _ in self.iterate_items(document_body, with_groups=True):
3533
- if item.label == DocItemLabel.CAPTION:
3534
- all_captions.update([item.self_ref])
3535
- if item.label in [DocItemLabel.PICTURE, DocItemLabel.TABLE]:
3536
- matched_captions.update([caption.cref for caption in item.captions])
3537
-
3538
- return all_captions - matched_captions
3539
-
3540
- # Initialization
3541
- output_parts: List[str] = []
3542
- ordered_list_stack: List[bool] = []
3543
- previous_level = 0
3544
- previous_page_no = None
3545
-
3546
- # Precompute standalone captions
3547
- standalone_captions = _get_standalone_captions(self.body)
3548
-
3549
- # Begin document
3550
- output_parts.append(f"<{DocumentToken.DOCUMENT.value}>{delim}")
3551
-
3552
- for ix, (item, current_level) in enumerate(
3553
- self.iterate_items(
3554
- self.body,
3555
- with_groups=True,
3556
- included_content_layers={
3557
- ContentLayer.BODY,
3558
- ContentLayer.FURNITURE,
3559
- },
3560
- )
3561
- ):
3562
- # Close lists if we've moved to a lower nesting level
3563
- if current_level < previous_level and ordered_list_stack:
3564
- ordered_list_stack = _close_lists(
3565
- current_level,
3566
- previous_level,
3567
- ordered_list_stack,
3568
- output_parts,
3569
- )
3570
- previous_level = current_level
3571
-
3572
- # Skip items outside the specified element range
3573
- if ix < from_element or ix >= to_element:
3574
- continue
3575
-
3576
- # Skip items whose label is not in the allowed set
3577
- if isinstance(item, DocItem) and (item.label not in labels):
3578
- continue
3579
-
3580
- # Skip captions that are not standalone as they will be included below
3581
- # by the export functions of Table and Picture
3582
- if (
3583
- isinstance(item, TextItem)
3584
- and item.label == DocItemLabel.CAPTION
3585
- and item.self_ref not in standalone_captions
3586
- ):
3587
- continue
3588
-
3589
- # Handle list groups
3590
- if isinstance(item, GroupItem):
3591
- if item.label == GroupLabel.ORDERED_LIST:
3592
- output_parts.append(f"<{DocumentToken.ORDERED_LIST.value}>{delim}")
3593
- ordered_list_stack.append(True)
3594
- elif item.label == GroupLabel.LIST:
3595
- output_parts.append(
3596
- f"<{DocumentToken.UNORDERED_LIST.value}>{delim}"
3597
- )
3598
- ordered_list_stack.append(False)
3599
- continue
3600
-
3601
- # For other item types, optionally insert page-break if the page changed
3602
- output_parts, previous_page_no = _add_page_break_if_needed(
3603
- output_parts, item, previous_page_no, add_page_index
3604
- )
3605
-
3606
- if isinstance(item, SectionHeaderItem):
3607
- output_parts.append(
3608
- item.export_to_document_tokens(
3609
- doc=self,
3610
- new_line=delim,
3611
- xsize=xsize,
3612
- ysize=ysize,
3613
- add_location=add_location,
3614
- add_content=add_content,
3615
- )
3616
- )
3617
- elif isinstance(item, CodeItem):
3618
- output_parts.append(
3619
- item.export_to_document_tokens(
3620
- doc=self,
3621
- new_line=delim,
3622
- xsize=xsize,
3623
- ysize=ysize,
3624
- add_location=add_location,
3625
- add_content=add_content,
3626
- )
3627
- )
3628
- elif isinstance(item, TextItem):
3629
- output_parts.append(
3630
- item.export_to_document_tokens(
3631
- doc=self,
3632
- new_line=delim,
3633
- xsize=xsize,
3634
- ysize=ysize,
3635
- add_location=add_location,
3636
- add_content=add_content,
3637
- )
3638
- )
3639
- elif isinstance(item, TableItem):
3640
- output_parts.append(
3641
- item.export_to_document_tokens(
3642
- doc=self,
3643
- new_line=delim,
3644
- xsize=xsize,
3645
- ysize=ysize,
3646
- add_location=add_location,
3647
- add_cell_location=add_table_cell_location,
3648
- add_cell_text=add_table_cell_text,
3649
- add_caption=True,
3650
- )
3651
- )
3652
- elif isinstance(item, PictureItem):
3653
- output_parts.append(
3654
- item.export_to_document_tokens(
3655
- doc=self,
3656
- new_line=delim,
3657
- xsize=xsize,
3658
- ysize=ysize,
3659
- add_caption=True,
3660
- add_location=add_location,
3661
- add_content=add_content,
3662
- )
3663
- )
3664
-
3665
- # End any lists that might still be open
3666
- ordered_list_stack = _close_lists(
3667
- 0, previous_level, ordered_list_stack, output_parts
3638
+ from docling_core.experimental.serializer.doctags import (
3639
+ DocTagsDocSerializer,
3640
+ DocTagsParams,
3668
3641
  )
3669
3642
 
3670
- # End document
3671
- output_parts.append(f"</{DocumentToken.DOCUMENT.value}>")
3672
-
3673
- return "".join(output_parts)
3643
+ my_labels = labels if labels is not None else DOCUMENT_TOKENS_EXPORT_LABELS
3644
+ serializer = DocTagsDocSerializer(
3645
+ doc=self,
3646
+ params=DocTagsParams(
3647
+ labels=my_labels,
3648
+ # layers=..., # not exposed
3649
+ start_idx=from_element,
3650
+ stop_idx=to_element,
3651
+ xsize=xsize,
3652
+ ysize=ysize,
3653
+ add_location=add_location,
3654
+ # add_caption=..., # not exposed
3655
+ add_content=add_content,
3656
+ add_page_break=add_page_index,
3657
+ add_table_cell_location=add_table_cell_location,
3658
+ add_table_cell_text=add_table_cell_text,
3659
+ mode=(
3660
+ DocTagsParams.Mode.MINIFIED
3661
+ if minified
3662
+ else DocTagsParams.Mode.HUMAN_FRIENDLY
3663
+ ),
3664
+ ),
3665
+ )
3666
+ ser_res = serializer.serialize()
3667
+ return ser_res.text
3674
3668
 
3675
3669
  def _export_to_indented_text(
3676
3670
  self,