docling-core: docling_core-2.18.1-py3-none-any.whl → docling_core-2.19.1-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. (The original page linked to more details here.)

@@ -19,6 +19,7 @@ from docling_core.search.package import VERSION_PATTERN
19
19
  from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
20
20
  from docling_core.types import DoclingDocument as DLDocument
21
21
  from docling_core.types.doc.document import (
22
+ CodeItem,
22
23
  DocItem,
23
24
  DocumentOrigin,
24
25
  LevelNumber,
@@ -199,8 +200,10 @@ class HierarchicalChunker(BaseChunker):
199
200
  heading_by_level.pop(k, None)
200
201
  continue
201
202
 
202
- if isinstance(item, TextItem) or (
203
- (not self.merge_list_items) and isinstance(item, ListItem)
203
+ if (
204
+ isinstance(item, TextItem)
205
+ or ((not self.merge_list_items) and isinstance(item, ListItem))
206
+ or isinstance(item, CodeItem)
204
207
  ):
205
208
  text = item.text
206
209
  elif isinstance(item, TableItem):
@@ -75,6 +75,14 @@ DEFAULT_EXPORT_LABELS = {
75
75
  DocItemLabel.PAGE_FOOTER,
76
76
  }
77
77
 
78
+ DOCUMENT_TOKENS_EXPORT_LABELS = DEFAULT_EXPORT_LABELS.copy()
79
+ DOCUMENT_TOKENS_EXPORT_LABELS.update(
80
+ [
81
+ DocItemLabel.FOOTNOTE,
82
+ DocItemLabel.CAPTION,
83
+ ]
84
+ )
85
+
78
86
 
79
87
  class BasePictureData(BaseModel):
80
88
  """BasePictureData."""
@@ -564,9 +572,8 @@ class DocItem(
564
572
  self,
565
573
  doc: "DoclingDocument",
566
574
  new_line: str,
567
- xsize: int = 100,
568
- ysize: int = 100,
569
- add_page_index: bool = True,
575
+ xsize: int = 500,
576
+ ysize: int = 500,
570
577
  ) -> str:
571
578
  """Get the location string for the BaseCell."""
572
579
  if not len(self.prov):
@@ -576,17 +583,12 @@ class DocItem(
576
583
  for prov in self.prov:
577
584
  page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
578
585
 
579
- page_i = -1
580
- if add_page_index:
581
- page_i = prov.page_no
582
-
583
586
  loc_str = DocumentToken.get_location(
584
- bbox=prov.bbox.to_bottom_left_origin(page_h).as_tuple(),
587
+ bbox=prov.bbox.to_top_left_origin(page_h).as_tuple(),
585
588
  page_w=page_w,
586
589
  page_h=page_h,
587
590
  xsize=xsize,
588
591
  ysize=ysize,
589
- page_i=page_i,
590
592
  )
591
593
  location += f"{loc_str}{new_line}"
592
594
 
@@ -641,57 +643,40 @@ class TextItem(DocItem):
641
643
  def export_to_document_tokens(
642
644
  self,
643
645
  doc: "DoclingDocument",
644
- new_line: str = "\n",
645
- xsize: int = 100,
646
- ysize: int = 100,
646
+ new_line: str = "",
647
+ xsize: int = 500,
648
+ ysize: int = 500,
647
649
  add_location: bool = True,
648
650
  add_content: bool = True,
649
- add_page_index: bool = True,
650
651
  ):
651
652
  r"""Export text element to document tokens format.
652
653
 
653
654
  :param doc: "DoclingDocument":
654
- :param new_line: str: (Default value = "\n")
655
- :param xsize: int: (Default value = 100)
656
- :param ysize: int: (Default value = 100)
655
+ :param new_line: str (Default value = "")
656
+ :param xsize: int: (Default value = 500)
657
+ :param ysize: int: (Default value = 500)
657
658
  :param add_location: bool: (Default value = True)
658
659
  :param add_content: bool: (Default value = True)
659
- :param add_page_index: bool: (Default value = True)
660
660
 
661
661
  """
662
- body = f"<{self.label.value}>"
663
-
664
- # TODO: This must be done through an explicit mapping.
665
- # assert DocumentToken.is_known_token(
666
- # body
667
- # ), f"failed DocumentToken.is_known_token({body})"
662
+ body = f"<{self.label.value}>{new_line}"
668
663
 
669
664
  if add_location:
670
665
  body += self.get_location_tokens(
671
666
  doc=doc,
672
- new_line="",
667
+ new_line=new_line,
673
668
  xsize=xsize,
674
669
  ysize=ysize,
675
- add_page_index=add_page_index,
676
670
  )
677
671
 
678
672
  if add_content and self.text is not None:
679
- body += self.text.strip()
673
+ body += f"{self.text.strip()}{new_line}"
680
674
 
681
- body += f"</{self.label.value}>{new_line}"
675
+ body += f"</{self.label.value}>\n"
682
676
 
683
677
  return body
684
678
 
685
679
 
686
- class CodeItem(TextItem):
687
- """CodeItem."""
688
-
689
- label: typing.Literal[DocItemLabel.CODE] = (
690
- DocItemLabel.CODE # type: ignore[assignment]
691
- )
692
- code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN
693
-
694
-
695
680
  class SectionHeaderItem(TextItem):
696
681
  """SectionItem."""
697
682
 
@@ -703,25 +688,23 @@ class SectionHeaderItem(TextItem):
703
688
  def export_to_document_tokens(
704
689
  self,
705
690
  doc: "DoclingDocument",
706
- new_line: str = "\n",
707
- xsize: int = 100,
708
- ysize: int = 100,
691
+ new_line: str = "",
692
+ xsize: int = 500,
693
+ ysize: int = 500,
709
694
  add_location: bool = True,
710
695
  add_content: bool = True,
711
- add_page_index: bool = True,
712
696
  ):
713
697
  r"""Export text element to document tokens format.
714
698
 
715
699
  :param doc: "DoclingDocument":
716
- :param new_line: str: (Default value = "\n")
717
- :param xsize: int: (Default value = 100)
718
- :param ysize: int: (Default value = 100)
700
+ :param new_line: str (Default value = "")
701
+ :param xsize: int: (Default value = 500)
702
+ :param ysize: int: (Default value = 500)
719
703
  :param add_location: bool: (Default value = True)
720
704
  :param add_content: bool: (Default value = True)
721
- :param add_page_index: bool: (Default value = True)
722
705
 
723
706
  """
724
- body = f"<{self.label.value}_level_{self.level}>"
707
+ body = f"<{self.label.value}_level_{self.level}>{new_line}"
725
708
 
726
709
  # TODO: This must be done through an explicit mapping.
727
710
  # assert DocumentToken.is_known_token(
@@ -731,16 +714,15 @@ class SectionHeaderItem(TextItem):
731
714
  if add_location:
732
715
  body += self.get_location_tokens(
733
716
  doc=doc,
734
- new_line="",
717
+ new_line=new_line,
735
718
  xsize=xsize,
736
719
  ysize=ysize,
737
- add_page_index=add_page_index,
738
720
  )
739
721
 
740
722
  if add_content and self.text is not None:
741
- body += self.text.strip()
723
+ body += f"{self.text.strip()}{new_line}"
742
724
 
743
- body += f"</{self.label.value}_level_{self.level}>{new_line}"
725
+ body += f"</{self.label.value}_level_{self.level}>\n"
744
726
 
745
727
  return body
746
728
 
@@ -785,6 +767,51 @@ class FloatingItem(DocItem):
785
767
  return super().get_image(doc=doc)
786
768
 
787
769
 
770
+ class CodeItem(FloatingItem, TextItem):
771
+ """CodeItem."""
772
+
773
+ label: typing.Literal[DocItemLabel.CODE] = (
774
+ DocItemLabel.CODE # type: ignore[assignment]
775
+ )
776
+ code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN
777
+
778
+ def export_to_document_tokens(
779
+ self,
780
+ doc: "DoclingDocument",
781
+ new_line: str = "",
782
+ xsize: int = 500,
783
+ ysize: int = 500,
784
+ add_location: bool = True,
785
+ add_content: bool = True,
786
+ ):
787
+ r"""Export text element to document tokens format.
788
+
789
+ :param doc: "DoclingDocument":
790
+ :param new_line: str (Default value = "")
791
+ :param xsize: int: (Default value = 500)
792
+ :param ysize: int: (Default value = 500)
793
+ :param add_location: bool: (Default value = True)
794
+ :param add_content: bool: (Default value = True)
795
+
796
+ """
797
+ body = f"<{self.label.value}{new_line}"
798
+
799
+ if add_location:
800
+ body += self.get_location_tokens(
801
+ doc=doc,
802
+ new_line=new_line,
803
+ xsize=xsize,
804
+ ysize=ysize,
805
+ )
806
+
807
+ if add_content and self.text is not None:
808
+ body += f"<_{self.code_language.value}_>{self.text}{new_line}"
809
+
810
+ body += f"</{self.label.value}\n"
811
+
812
+ return body
813
+
814
+
788
815
  class PictureItem(FloatingItem):
789
816
  """PictureItem."""
790
817
 
@@ -931,47 +958,62 @@ class PictureItem(FloatingItem):
931
958
  def export_to_document_tokens(
932
959
  self,
933
960
  doc: "DoclingDocument",
934
- new_line: str = "\n",
935
- xsize: int = 100,
936
- ysize: int = 100,
961
+ new_line: str = "",
962
+ xsize: int = 500,
963
+ ysize: int = 500,
937
964
  add_location: bool = True,
938
965
  add_caption: bool = True,
939
966
  add_content: bool = True, # not used at the moment
940
- add_page_index: bool = True,
941
967
  ):
942
968
  r"""Export picture to document tokens format.
943
969
 
944
970
  :param doc: "DoclingDocument":
945
- :param new_line: str: (Default value = "\n")
946
- :param xsize: int: (Default value = 100)
947
- :param ysize: int: (Default value = 100)
971
+ :param new_line: str (Default value = "")
972
+ :param xsize: int: (Default value = 500)
973
+ :param ysize: int: (Default value = 500)
948
974
  :param add_location: bool: (Default value = True)
949
975
  :param add_caption: bool: (Default value = True)
950
976
  :param add_content: bool: (Default value = True)
951
- :param # not used at the momentadd_page_index: bool: (Default value = True)
977
+ :param # not used at the moment
952
978
 
953
979
  """
954
- body = f"{DocumentToken.BEG_FIGURE.value}{new_line}"
955
-
980
+ body = f"<{self.label.value}>{new_line}"
956
981
  if add_location:
957
982
  body += self.get_location_tokens(
958
983
  doc=doc,
959
984
  new_line=new_line,
960
985
  xsize=xsize,
961
986
  ysize=ysize,
962
- add_page_index=add_page_index,
963
987
  )
964
988
 
989
+ classifications = [
990
+ ann
991
+ for ann in self.annotations
992
+ if isinstance(ann, PictureClassificationData)
993
+ ]
994
+ if len(classifications) > 0:
995
+ # ! TODO: currently this code assumes class_name is of type 'str'
996
+ # ! TODO: when it will change to an ENUM --> adapt code
997
+ predicted_class = classifications[0].predicted_classes[0].class_name
998
+ body += DocumentToken.get_picture_classification_token(predicted_class)
999
+
965
1000
  if add_caption and len(self.captions):
966
1001
  text = self.caption_text(doc)
967
1002
 
968
1003
  if len(text):
969
- body += f"{DocumentToken.BEG_CAPTION.value}"
1004
+ body += f"<{DocItemLabel.CAPTION.value}>"
1005
+ for caption in self.captions:
1006
+ body += caption.resolve(doc).get_location_tokens(
1007
+ doc=doc,
1008
+ new_line=new_line,
1009
+ xsize=xsize,
1010
+ ysize=ysize,
1011
+ )
970
1012
  body += f"{text.strip()}"
971
- body += f"{DocumentToken.END_CAPTION.value}"
1013
+ body += f"</{DocItemLabel.CAPTION.value}>"
972
1014
  body += f"{new_line}"
973
1015
 
974
- body += f"{DocumentToken.END_FIGURE.value}{new_line}"
1016
+ body += f"</{self.label.value}>\n"
975
1017
 
976
1018
  return body
977
1019
 
@@ -1143,8 +1185,8 @@ class TableItem(FloatingItem):
1143
1185
  doc: "DoclingDocument",
1144
1186
  add_cell_location: bool = True,
1145
1187
  add_cell_text: bool = True,
1146
- xsize: int = 100,
1147
- ysize: int = 100,
1188
+ xsize: int = 500,
1189
+ ysize: int = 500,
1148
1190
  ) -> str:
1149
1191
  """Export the table as OTSL."""
1150
1192
  # Possible OTSL tokens...
@@ -1194,7 +1236,6 @@ class TableItem(FloatingItem):
1194
1236
  page_h=page_h,
1195
1237
  xsize=xsize,
1196
1238
  ysize=ysize,
1197
- page_i=page_no,
1198
1239
  )
1199
1240
 
1200
1241
  if rowstart == i and colstart == j:
@@ -1234,33 +1275,29 @@ class TableItem(FloatingItem):
1234
1275
  def export_to_document_tokens(
1235
1276
  self,
1236
1277
  doc: "DoclingDocument",
1237
- new_line: str = "\n",
1238
- xsize: int = 100,
1239
- ysize: int = 100,
1278
+ new_line: str = "",
1279
+ xsize: int = 500,
1280
+ ysize: int = 500,
1240
1281
  add_location: bool = True,
1241
- add_caption: bool = True,
1242
- add_content: bool = True,
1243
1282
  add_cell_location: bool = True,
1244
- add_cell_label: bool = True,
1245
1283
  add_cell_text: bool = True,
1246
- add_page_index: bool = True,
1284
+ add_caption: bool = True,
1247
1285
  ):
1248
1286
  r"""Export table to document tokens format.
1249
1287
 
1250
1288
  :param doc: "DoclingDocument":
1251
- :param new_line: str: (Default value = "\n")
1252
- :param xsize: int: (Default value = 100)
1253
- :param ysize: int: (Default value = 100)
1289
+ :param new_line: str (Default value = "")
1290
+ :param xsize: int: (Default value = 500)
1291
+ :param ysize: int: (Default value = 500)
1254
1292
  :param add_location: bool: (Default value = True)
1255
- :param add_caption: bool: (Default value = True)
1256
- :param add_content: bool: (Default value = True)
1257
1293
  :param add_cell_location: bool: (Default value = True)
1258
- :param add_cell_label: bool: (Default value = True)
1259
1294
  :param add_cell_text: bool: (Default value = True)
1260
- :param add_page_index: bool: (Default value = True)
1295
+ :param add_caption: bool: (Default value = True)
1261
1296
 
1262
1297
  """
1263
- body = f"{DocumentToken.BEG_TABLE.value}{new_line}"
1298
+ otsl_tag = DocumentToken.OTSL.value
1299
+
1300
+ body = f"<{otsl_tag}>{new_line}"
1264
1301
 
1265
1302
  if add_location:
1266
1303
  body += self.get_location_tokens(
@@ -1268,76 +1305,27 @@ class TableItem(FloatingItem):
1268
1305
  new_line=new_line,
1269
1306
  xsize=xsize,
1270
1307
  ysize=ysize,
1271
- add_page_index=add_page_index,
1272
1308
  )
1273
1309
 
1310
+ body += self.export_to_otsl(doc, add_cell_location, add_cell_text, xsize, ysize)
1311
+
1274
1312
  if add_caption and len(self.captions):
1275
1313
  text = self.caption_text(doc)
1276
1314
 
1277
1315
  if len(text):
1278
- body += f"{DocumentToken.BEG_CAPTION.value}"
1316
+ body += f"<{DocItemLabel.CAPTION.value}>"
1317
+ for caption in self.captions:
1318
+ body += caption.resolve(doc).get_location_tokens(
1319
+ doc=doc,
1320
+ new_line=new_line,
1321
+ xsize=xsize,
1322
+ ysize=ysize,
1323
+ )
1279
1324
  body += f"{text.strip()}"
1280
- body += f"{DocumentToken.END_CAPTION.value}"
1325
+ body += f"</{DocItemLabel.CAPTION.value}>"
1281
1326
  body += f"{new_line}"
1282
1327
 
1283
- if add_content and len(self.data.table_cells) > 0:
1284
- for i, row in enumerate(self.data.grid):
1285
- body += f"<row_{i}>"
1286
- for j, col in enumerate(row):
1287
-
1288
- text = ""
1289
- if add_cell_text:
1290
- text = col.text.strip()
1291
-
1292
- cell_loc = ""
1293
- if (
1294
- col.bbox is not None
1295
- and add_cell_location
1296
- and add_page_index
1297
- and len(self.prov) > 0
1298
- ):
1299
- page_w, page_h = doc.pages[self.prov[0].page_no].size.as_tuple()
1300
- cell_loc = DocumentToken.get_location(
1301
- bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
1302
- page_w=page_w,
1303
- page_h=page_h,
1304
- xsize=xsize,
1305
- ysize=ysize,
1306
- page_i=self.prov[0].page_no,
1307
- )
1308
- elif (
1309
- col.bbox is not None
1310
- and add_cell_location
1311
- and not add_page_index
1312
- and len(self.prov) > 0
1313
- ):
1314
- page_w, page_h = doc.pages[self.prov[0].page_no].size.as_tuple()
1315
-
1316
- cell_loc = DocumentToken.get_location(
1317
- bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
1318
- page_w=page_w,
1319
- page_h=page_h,
1320
- xsize=xsize,
1321
- ysize=ysize,
1322
- page_i=-1,
1323
- )
1324
-
1325
- cell_label = ""
1326
- if add_cell_label:
1327
- if col.column_header:
1328
- cell_label = "<col_header>"
1329
- elif col.row_header:
1330
- cell_label = "<row_header>"
1331
- elif col.row_section:
1332
- cell_label = "<row_section>"
1333
- else:
1334
- cell_label = "<body>"
1335
-
1336
- body += f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>"
1337
-
1338
- body += f"</row_{i}>{new_line}"
1339
-
1340
- body += f"{DocumentToken.END_TABLE.value}{new_line}"
1328
+ body += f"</{otsl_tag}>\n"
1341
1329
 
1342
1330
  return body
1343
1331
 
@@ -1777,6 +1765,7 @@ class DoclingDocument(BaseModel):
1777
1765
  text: str,
1778
1766
  code_language: Optional[CodeLanguageLabel] = None,
1779
1767
  orig: Optional[str] = None,
1768
+ caption: Optional[Union[TextItem, RefItem]] = None,
1780
1769
  prov: Optional[ProvenanceItem] = None,
1781
1770
  parent: Optional[NodeItem] = None,
1782
1771
  content_layer: Optional[ContentLayer] = None,
@@ -1786,6 +1775,8 @@ class DoclingDocument(BaseModel):
1786
1775
  :param text: str:
1787
1776
  :param code_language: Optional[str]: (Default value = None)
1788
1777
  :param orig: Optional[str]: (Default value = None)
1778
+ :param caption: Optional[Union[TextItem:
1779
+ :param RefItem]]: (Default value = None)
1789
1780
  :param prov: Optional[ProvenanceItem]: (Default value = None)
1790
1781
  :param parent: Optional[NodeItem]: (Default value = None)
1791
1782
  """
@@ -1809,6 +1800,8 @@ class DoclingDocument(BaseModel):
1809
1800
  code_item.content_layer = content_layer
1810
1801
  if prov:
1811
1802
  code_item.prov.append(prov)
1803
+ if caption:
1804
+ code_item.captions.append(caption.get_ref())
1812
1805
 
1813
1806
  self.texts.append(code_item)
1814
1807
  parent.children.append(RefItem(cref=cref))
@@ -1927,6 +1920,7 @@ class DoclingDocument(BaseModel):
1927
1920
  traverse_pictures=traverse_pictures,
1928
1921
  page_no=page_no,
1929
1922
  _level=_level + 1,
1923
+ included_content_layers=included_content_layers,
1930
1924
  )
1931
1925
 
1932
1926
  def _clear_picture_pil_cache(self):
@@ -2132,6 +2126,7 @@ class DoclingDocument(BaseModel):
2132
2126
  indent: int = 4,
2133
2127
  text_width: int = -1,
2134
2128
  page_no: Optional[int] = None,
2129
+ included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
2135
2130
  ):
2136
2131
  """Save to markdown."""
2137
2132
  artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
@@ -2155,6 +2150,7 @@ class DoclingDocument(BaseModel):
2155
2150
  indent=indent,
2156
2151
  text_width=text_width,
2157
2152
  page_no=page_no,
2153
+ included_content_layers=included_content_layers,
2158
2154
  )
2159
2155
 
2160
2156
  with open(filename, "w", encoding="utf-8") as fw:
@@ -2173,6 +2169,7 @@ class DoclingDocument(BaseModel):
2173
2169
  indent: int = 4,
2174
2170
  text_width: int = -1,
2175
2171
  page_no: Optional[int] = None,
2172
+ included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
2176
2173
  ) -> str:
2177
2174
  r"""Serialize to Markdown.
2178
2175
 
@@ -2254,7 +2251,12 @@ class DoclingDocument(BaseModel):
2254
2251
  mdtexts.append(text)
2255
2252
 
2256
2253
  for ix, (item, level) in enumerate(
2257
- self.iterate_items(self.body, with_groups=True, page_no=page_no)
2254
+ self.iterate_items(
2255
+ self.body,
2256
+ with_groups=True,
2257
+ page_no=page_no,
2258
+ included_content_layers=included_content_layers,
2259
+ )
2258
2260
  ):
2259
2261
  # If we've moved to a lower level, we're exiting one or more groups
2260
2262
  if level < previous_level:
@@ -2423,6 +2425,7 @@ class DoclingDocument(BaseModel):
2423
2425
  page_no: Optional[int] = None,
2424
2426
  html_lang: str = "en",
2425
2427
  html_head: str = _HTML_DEFAULT_HEAD,
2428
+ included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
2426
2429
  ):
2427
2430
  """Save to HTML."""
2428
2431
  artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
@@ -2443,6 +2446,7 @@ class DoclingDocument(BaseModel):
2443
2446
  page_no=page_no,
2444
2447
  html_lang=html_lang,
2445
2448
  html_head=html_head,
2449
+ included_content_layers=included_content_layers,
2446
2450
  )
2447
2451
 
2448
2452
  with open(filename, "w", encoding="utf-8") as fw:
@@ -2490,6 +2494,7 @@ class DoclingDocument(BaseModel):
2490
2494
  page_no: Optional[int] = None,
2491
2495
  html_lang: str = "en",
2492
2496
  html_head: str = _HTML_DEFAULT_HEAD,
2497
+ included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
2493
2498
  ) -> str:
2494
2499
  r"""Serialize to HTML."""
2495
2500
 
@@ -2531,7 +2536,12 @@ class DoclingDocument(BaseModel):
2531
2536
  return text
2532
2537
 
2533
2538
  for ix, (item, curr_level) in enumerate(
2534
- self.iterate_items(self.body, with_groups=True, page_no=page_no)
2539
+ self.iterate_items(
2540
+ self.body,
2541
+ with_groups=True,
2542
+ page_no=page_no,
2543
+ included_content_layers=included_content_layers,
2544
+ )
2535
2545
  ):
2536
2546
  # If we've moved to a lower level, we're exiting one or more groups
2537
2547
  if curr_level < prev_level and len(in_ordered_list) > 0:
@@ -2708,22 +2718,18 @@ class DoclingDocument(BaseModel):
2708
2718
  def save_as_document_tokens(
2709
2719
  self,
2710
2720
  filename: Path,
2711
- delim: str = "\n\n",
2721
+ delim: str = "",
2712
2722
  from_element: int = 0,
2713
2723
  to_element: int = sys.maxsize,
2714
- labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
2715
- xsize: int = 100,
2716
- ysize: int = 100,
2724
+ labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
2725
+ xsize: int = 500,
2726
+ ysize: int = 500,
2717
2727
  add_location: bool = True,
2718
2728
  add_content: bool = True,
2719
2729
  add_page_index: bool = True,
2720
2730
  # table specific flags
2721
2731
  add_table_cell_location: bool = False,
2722
- add_table_cell_label: bool = True,
2723
2732
  add_table_cell_text: bool = True,
2724
- # specifics
2725
- page_no: Optional[int] = None,
2726
- with_groups: bool = True,
2727
2733
  ):
2728
2734
  r"""Save the document content to a DocumentToken format."""
2729
2735
  out = self.export_to_document_tokens(
@@ -2738,198 +2744,230 @@ class DoclingDocument(BaseModel):
2738
2744
  add_page_index=add_page_index,
2739
2745
  # table specific flags
2740
2746
  add_table_cell_location=add_table_cell_location,
2741
- add_table_cell_label=add_table_cell_label,
2742
2747
  add_table_cell_text=add_table_cell_text,
2743
- # specifics
2744
- page_no=page_no,
2745
- with_groups=with_groups,
2746
2748
  )
2747
2749
 
2748
2750
  with open(filename, "w", encoding="utf-8") as fw:
2749
2751
  fw.write(out)
2750
2752
 
2751
- def export_to_document_tokens(
2753
+ def export_to_document_tokens( # noqa: C901
2752
2754
  self,
2753
- delim: str = "\n",
2755
+ delim: str = "",
2754
2756
  from_element: int = 0,
2755
2757
  to_element: int = sys.maxsize,
2756
- labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
2757
- xsize: int = 100,
2758
- ysize: int = 100,
2758
+ labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
2759
+ xsize: int = 500,
2760
+ ysize: int = 500,
2759
2761
  add_location: bool = True,
2760
2762
  add_content: bool = True,
2761
2763
  add_page_index: bool = True,
2762
2764
  # table specific flags
2763
2765
  add_table_cell_location: bool = False,
2764
- add_table_cell_label: bool = True,
2765
2766
  add_table_cell_text: bool = True,
2766
- # specifics
2767
- page_no: Optional[int] = None,
2768
- with_groups: bool = True,
2769
- newline: bool = True,
2770
2767
  ) -> str:
2771
2768
  r"""Exports the document content to a DocumentToken format.
2772
2769
 
2773
2770
  Operates on a slice of the document's body as defined through arguments
2774
2771
  from_element and to_element; defaulting to the whole main_text.
2775
2772
 
2776
- :param delim: str: (Default value = "\n\n")
2773
+ :param delim: str: (Default value = "")
2777
2774
  :param from_element: int: (Default value = 0)
2778
2775
  :param to_element: Optional[int]: (Default value = None)
2779
2776
  :param labels: set[DocItemLabel]
2780
- :param xsize: int: (Default value = 100)
2781
- :param ysize: int: (Default value = 100)
2777
+ :param xsize: int: (Default value = 500)
2778
+ :param ysize: int: (Default value = 500)
2782
2779
  :param add_location: bool: (Default value = True)
2783
2780
  :param add_content: bool: (Default value = True)
2784
2781
  :param add_page_index: bool: (Default value = True)
2785
2782
  :param # table specific flagsadd_table_cell_location: bool
2786
- :param add_table_cell_label: bool: (Default value = True)
2787
2783
  :param add_table_cell_text: bool: (Default value = True)
2788
2784
  :returns: The content of the document formatted as a DocTags string.
2789
2785
  :rtype: str
2790
2786
  """
2791
2787
 
2792
- def close_lists(
2793
- curr_level: int,
2794
- prev_level: int,
2795
- in_ordered_list: List[bool],
2796
- result: str,
2797
- delim: str,
2798
- ):
2799
-
2800
- if len(in_ordered_list) == 0:
2801
- return (in_ordered_list, result)
2802
-
2803
- while curr_level < prev_level and len(in_ordered_list) > 0:
2804
- if in_ordered_list[-1]:
2805
- result += f"</ordered_list>{delim}"
2788
+ def _close_lists(
2789
+ current_level: int,
2790
+ previous_level: int,
2791
+ ordered_list_stack: List[bool],
2792
+ output_parts: List[str],
2793
+ ) -> List[bool]:
2794
+ """Close open list tags until the nesting level matches item's level."""
2795
+ while current_level < previous_level and ordered_list_stack:
2796
+ last_is_ordered = ordered_list_stack.pop()
2797
+ if last_is_ordered:
2798
+ output_parts.append(f"</{DocumentToken.ORDERED_LIST.value}>\n")
2806
2799
  else:
2807
- result += f"</unordered_list>{delim}"
2808
-
2809
- prev_level -= 1
2810
- in_ordered_list.pop() # = in_ordered_list[:-1]
2811
-
2812
- return (in_ordered_list, result)
2813
-
2814
- if newline:
2815
- delim = "\n"
2816
- else:
2817
- delim = ""
2818
-
2819
- prev_level = 0 # Track the previous item's level
2820
-
2821
- in_ordered_list: List[bool] = [] # False
2822
-
2823
- result = f"{DocumentToken.BEG_DOCUMENT.value}{delim}"
2824
-
2825
- for ix, (item, curr_level) in enumerate(
2826
- self.iterate_items(self.body, with_groups=True)
2800
+ output_parts.append(f"</{DocumentToken.UNORDERED_LIST.value}>\n")
2801
+ previous_level -= 1
2802
+ return ordered_list_stack
2803
+
2804
+ def _add_page_break_if_needed(
2805
+ output_parts: List[str],
2806
+ item,
2807
+ prev_page_no,
2808
+ page_break_enabled: bool,
2827
2809
  ):
2828
-
2829
- # If we've moved to a lower level, we're exiting one or more groups
2830
- if curr_level < prev_level and len(in_ordered_list) > 0:
2831
- # Calculate how many levels we've exited
2832
- # level_difference = previous_level - level
2833
- # Decrement list_nesting_level for each list group we've exited
2834
- # list_nesting_level = max(0, list_nesting_level - level_difference)
2835
-
2836
- in_ordered_list, result = close_lists(
2837
- curr_level=curr_level,
2838
- prev_level=prev_level,
2839
- in_ordered_list=in_ordered_list,
2840
- result=result,
2841
- delim=delim,
2810
+ """Inserts a page-break token.
2811
+
2812
+ Inserts a page-break token if the item's page number is different
2813
+ from the previous item and page breaks are enabled.
2814
+ Returns the updated output_parts list and the current page number.
2815
+ """
2816
+ if not page_break_enabled:
2817
+ return output_parts, prev_page_no
2818
+
2819
+ if not item.prov:
2820
+ return output_parts, prev_page_no
2821
+
2822
+ current_page_no = item.prov[0].page_no
2823
+ if prev_page_no is None:
2824
+ return output_parts, current_page_no
2825
+
2826
+ if current_page_no != prev_page_no:
2827
+ output_parts.append(f"<{DocumentToken.PAGE_BREAK.value}>\n")
2828
+
2829
+ return output_parts, current_page_no
2830
+
2831
+ def _get_standalone_captions(document_body):
2832
+ """Identify captions that are not attached to any table or figure."""
2833
+ all_captions = set()
2834
+ matched_captions = set()
2835
+ for item, _ in self.iterate_items(document_body, with_groups=True):
2836
+ if item.label == DocItemLabel.CAPTION:
2837
+ all_captions.update([item.self_ref])
2838
+ if item.label in [DocItemLabel.PICTURE, DocItemLabel.TABLE]:
2839
+ matched_captions.update([caption.cref for caption in item.captions])
2840
+
2841
+ return all_captions - matched_captions
2842
+
2843
+ # Initialization
2844
+ output_parts: List[str] = []
2845
+ ordered_list_stack: List[bool] = []
2846
+ previous_level = 0
2847
+ previous_page_no = None
2848
+
2849
+ # Precompute standalone captions
2850
+ standalone_captions = _get_standalone_captions(self.body)
2851
+
2852
+ # Begin document
2853
+ output_parts.append(f"<{DocumentToken.DOCUMENT.value}>{delim}")
2854
+
2855
+ for ix, (item, current_level) in enumerate(
2856
+ self.iterate_items(
2857
+ self.body,
2858
+ with_groups=True,
2859
+ included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE},
2860
+ )
2861
+ ):
2862
+ # Close lists if we've moved to a lower nesting level
2863
+ if current_level < previous_level and ordered_list_stack:
2864
+ ordered_list_stack = _close_lists(
2865
+ current_level, previous_level, ordered_list_stack, output_parts
2842
2866
  )
2867
+ previous_level = current_level
2843
2868
 
2844
- prev_level = curr_level # Update previous_level for next iteration
2845
-
2846
- if ix < from_element or to_element <= ix:
2847
- continue # skip as many items as you want
2848
-
2849
- if (isinstance(item, DocItem)) and (item.label not in labels):
2850
- continue # skip any label that is not whitelisted
2851
-
2852
- if isinstance(item, GroupItem) and item.label in [
2853
- GroupLabel.ORDERED_LIST,
2854
- ]:
2869
+ # Skip items outside the specified element range
2870
+ if ix < from_element or ix >= to_element:
2871
+ continue
2855
2872
 
2856
- result += f"<ordered_list>{delim}"
2857
- in_ordered_list.append(True)
2873
+ # Skip items whose label is not in the allowed set
2874
+ if isinstance(item, DocItem) and (item.label not in labels):
2875
+ continue
2858
2876
 
2859
- elif isinstance(item, GroupItem) and item.label in [
2860
- GroupLabel.LIST,
2861
- ]:
2877
+ # Skip captions that are not standalone as they will be included below
2878
+ # by the export functions of Table and Picture
2879
+ if (
2880
+ isinstance(item, TextItem)
2881
+ and item.label == DocItemLabel.CAPTION
2882
+ and item.self_ref not in standalone_captions
2883
+ ):
2884
+ continue
2862
2885
 
2863
- result += f"<unordered_list>{delim}"
2864
- in_ordered_list.append(False)
2886
+ # Handle list groups
2887
+ if isinstance(item, GroupItem):
2888
+ if item.label == GroupLabel.ORDERED_LIST:
2889
+ output_parts.append(f"<{DocumentToken.ORDERED_LIST.value}>{delim}")
2890
+ ordered_list_stack.append(True)
2891
+ elif item.label == GroupLabel.LIST:
2892
+ output_parts.append(
2893
+ f"<{DocumentToken.UNORDERED_LIST.value}>{delim}"
2894
+ )
2895
+ ordered_list_stack.append(False)
2896
+ continue
2865
2897
 
2866
- elif isinstance(item, SectionHeaderItem):
2898
+ # For other item types, optionally insert page-break if the page changed
2899
+ output_parts, previous_page_no = _add_page_break_if_needed(
2900
+ output_parts, item, previous_page_no, add_page_index
2901
+ )
2867
2902
 
2868
- result += item.export_to_document_tokens(
2869
- doc=self,
2870
- new_line=delim,
2871
- xsize=xsize,
2872
- ysize=ysize,
2873
- add_location=add_location,
2874
- add_content=add_content,
2875
- add_page_index=add_page_index,
2903
+ if isinstance(item, SectionHeaderItem):
2904
+ output_parts.append(
2905
+ item.export_to_document_tokens(
2906
+ doc=self,
2907
+ new_line=delim,
2908
+ xsize=xsize,
2909
+ ysize=ysize,
2910
+ add_location=add_location,
2911
+ add_content=add_content,
2912
+ )
2876
2913
  )
2877
- elif isinstance(item, CodeItem) and (item.label in labels):
2878
-
2879
- result += item.export_to_document_tokens(
2880
- doc=self,
2881
- new_line=delim,
2882
- xsize=xsize,
2883
- ysize=ysize,
2884
- add_location=add_location,
2885
- add_content=add_content,
2886
- add_page_index=add_page_index,
2914
+ elif isinstance(item, CodeItem):
2915
+ output_parts.append(
2916
+ item.export_to_document_tokens(
2917
+ doc=self,
2918
+ new_line=delim,
2919
+ xsize=xsize,
2920
+ ysize=ysize,
2921
+ add_location=add_location,
2922
+ add_content=add_content,
2923
+ )
2887
2924
  )
2888
-
2889
- elif isinstance(item, TextItem) and (item.label in labels):
2890
-
2891
- result += item.export_to_document_tokens(
2892
- doc=self,
2893
- new_line=delim,
2894
- xsize=xsize,
2895
- ysize=ysize,
2896
- add_location=add_location,
2897
- add_content=add_content,
2898
- add_page_index=add_page_index,
2925
+ elif isinstance(item, TextItem):
2926
+ output_parts.append(
2927
+ item.export_to_document_tokens(
2928
+ doc=self,
2929
+ new_line=delim,
2930
+ xsize=xsize,
2931
+ ysize=ysize,
2932
+ add_location=add_location,
2933
+ add_content=add_content,
2934
+ )
2899
2935
  )
2900
-
2901
- elif isinstance(item, TableItem) and (item.label in labels):
2902
-
2903
- result += item.export_to_document_tokens(
2904
- doc=self,
2905
- new_line=delim,
2906
- xsize=xsize,
2907
- ysize=ysize,
2908
- add_caption=True,
2909
- add_location=add_location,
2910
- add_content=add_content,
2911
- add_cell_location=add_table_cell_location,
2912
- add_cell_label=add_table_cell_label,
2913
- add_cell_text=add_table_cell_text,
2914
- add_page_index=add_page_index,
2936
+ elif isinstance(item, TableItem):
2937
+ output_parts.append(
2938
+ item.export_to_document_tokens(
2939
+ doc=self,
2940
+ new_line=delim,
2941
+ xsize=xsize,
2942
+ ysize=ysize,
2943
+ add_location=add_location,
2944
+ add_cell_location=add_table_cell_location,
2945
+ add_cell_text=add_table_cell_text,
2946
+ add_caption=True,
2947
+ )
2915
2948
  )
2916
-
2917
- elif isinstance(item, PictureItem) and (item.label in labels):
2918
-
2919
- result += item.export_to_document_tokens(
2920
- doc=self,
2921
- new_line=delim,
2922
- xsize=xsize,
2923
- ysize=ysize,
2924
- add_caption=True,
2925
- add_location=add_location,
2926
- add_content=add_content,
2927
- add_page_index=add_page_index,
2949
+ elif isinstance(item, PictureItem):
2950
+ output_parts.append(
2951
+ item.export_to_document_tokens(
2952
+ doc=self,
2953
+ new_line=delim,
2954
+ xsize=xsize,
2955
+ ysize=ysize,
2956
+ add_caption=True,
2957
+ add_location=add_location,
2958
+ add_content=add_content,
2959
+ )
2928
2960
  )
2929
2961
 
2930
- result += DocumentToken.END_DOCUMENT.value
2962
+ # End any lists that might still be open
2963
+ ordered_list_stack = _close_lists(
2964
+ 0, previous_level, ordered_list_stack, output_parts
2965
+ )
2931
2966
 
2932
- return result
2967
+ # End document
2968
+ output_parts.append(f"</{DocumentToken.DOCUMENT.value}>")
2969
+
2970
+ return "".join(output_parts)
2933
2971
 
2934
2972
  def _export_to_indented_text(
2935
2973
  self, indent=" ", max_text_len: int = -1, explicit_tables: bool = False
@@ -111,7 +111,7 @@ class PictureClassificationLabel(str, Enum):
111
111
  SIGNATURE = "signature"
112
112
  STAMP = "stamp"
113
113
  QR_CODE = "qr_code"
114
- BAR_CODE = "bat_code"
114
+ BAR_CODE = "bar_code"
115
115
  SCREENSHOT = "screenshot"
116
116
 
117
117
  # Geology/Geography
@@ -8,13 +8,15 @@
8
8
  from enum import Enum
9
9
  from typing import Tuple
10
10
 
11
+ from docling_core.types.doc.labels import PictureClassificationLabel
12
+
11
13
 
12
14
  class TableToken(Enum):
13
15
  """Class to represent an LLM friendly representation of a Table."""
14
16
 
15
17
  CELL_LABEL_COLUMN_HEADER = "<column_header>"
16
18
  CELL_LABEL_ROW_HEADER = "<row_header>"
17
- CELL_LABEL_SECTION_HEADERE = "<section_header>"
19
+ CELL_LABEL_SECTION_HEADER = "<shed>"
18
20
  CELL_LABEL_DATA = "<data>"
19
21
 
20
22
  OTSL_ECEL = "<ecel>" # empty cell
@@ -42,83 +44,30 @@ class TableToken(Enum):
42
44
  class DocumentToken(Enum):
43
45
  """Class to represent an LLM friendly representation of a Document."""
44
46
 
45
- BEG_DOCUMENT = "<document>"
46
- END_DOCUMENT = "</document>"
47
-
48
- BEG_TITLE = "<title>"
49
- END_TITLE = "</title>"
50
-
51
- BEG_ABSTRACT = "<abstract>"
52
- END_ABSTRACT = "</abstract>"
53
-
54
- BEG_DOI = "<doi>"
55
- END_DOI = "</doi>"
56
- BEG_DATE = "<date>"
57
- END_DATE = "</date>"
58
-
59
- BEG_AUTHORS = "<authors>"
60
- END_AUTHORS = "</authors>"
61
- BEG_AUTHOR = "<author>"
62
- END_AUTHOR = "</author>"
63
-
64
- BEG_AFFILIATIONS = "<affiliations>"
65
- END_AFFILIATIONS = "</affiliations>"
66
- BEG_AFFILIATION = "<affiliation>"
67
- END_AFFILIATION = "</affiliation>"
68
-
69
- BEG_HEADER = "<section-header>"
70
- END_HEADER = "</section-header>"
71
- BEG_TEXT = "<text>"
72
- END_TEXT = "</text>"
73
- BEG_PARAGRAPH = "<paragraph>"
74
- END_PARAGRAPH = "</paragraph>"
75
- BEG_TABLE = "<table>"
76
- END_TABLE = "</table>"
77
- BEG_FIGURE = "<figure>"
78
- END_FIGURE = "</figure>"
79
- BEG_CAPTION = "<caption>"
80
- END_CAPTION = "</caption>"
81
- BEG_EQUATION = "<equation>"
82
- END_EQUATION = "</equation>"
83
- BEG_LIST = "<list>"
84
- END_LIST = "</list>"
85
- BEG_LISTITEM = "<list-item>"
86
- END_LISTITEM = "</list-item>"
87
-
88
- BEG_LOCATION = "<location>"
89
- END_LOCATION = "</location>"
90
- BEG_GROUP = "<group>"
91
- END_GROUP = "</group>"
47
+ DOCUMENT = "doctag"
48
+ OTSL = "otsl"
49
+ ORDERED_LIST = "ordered_list"
50
+ UNORDERED_LIST = "unordered_list"
51
+ LOC = "loc_"
52
+ PAGE_BREAK = "page_break"
92
53
 
93
54
  @classmethod
94
55
  def get_special_tokens(
95
56
  cls,
96
- max_rows: int = 100,
97
- max_cols: int = 100,
98
- max_pages: int = 1000,
99
57
  page_dimension: Tuple[int, int] = (100, 100),
100
58
  ):
101
59
  """Function to get all special document tokens."""
102
60
  special_tokens = [token.value for token in cls]
103
61
 
104
- # Adding dynamically generated row and col tokens
105
- for i in range(0, max_rows + 1):
106
- special_tokens += [f"<row_{i}>", f"</row_{i}>"]
107
-
108
- for i in range(0, max_cols + 1):
109
- special_tokens += [f"<col_{i}>", f"</col_{i}>"]
110
-
111
- for i in range(6):
112
- special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
113
-
114
- # FIXME: this is synonym of section header
115
62
  for i in range(6):
116
- special_tokens += [f"<subtitle-level-{i}>", f"</subtitle-level-{i}>"]
63
+ special_tokens += [
64
+ f"<section_header_level_{i}>",
65
+ f"</section_header_level_{i}>",
66
+ ]
117
67
 
118
- # Adding dynamically generated page-tokens
119
- for i in range(0, max_pages + 1):
120
- special_tokens.append(f"<page_{i}>")
121
- special_tokens.append(f"</page_{i}>")
68
+ # Add dynamically picture classification tokens
69
+ for _, member in PictureClassificationLabel.__members__.items():
70
+ special_tokens.append(f"<{member}>")
122
71
 
123
72
  # Adding dynamically generated location-tokens
124
73
  for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
@@ -132,25 +81,9 @@ class DocumentToken(Enum):
132
81
  return label in DocumentToken.get_special_tokens()
133
82
 
134
83
  @staticmethod
135
- def get_row_token(row: int, beg=bool) -> str:
136
- """Function to get page tokens."""
137
- if beg:
138
- return f"<row_{row}>"
139
- else:
140
- return f"</row_{row}>"
141
-
142
- @staticmethod
143
- def get_col_token(col: int, beg=bool) -> str:
144
- """Function to get page tokens."""
145
- if beg:
146
- return f"<col_{col}>"
147
- else:
148
- return f"</col_{col}>"
149
-
150
- @staticmethod
151
- def get_page_token(page: int):
152
- """Function to get page tokens."""
153
- return f"<page_{page}>"
84
+ def get_picture_classification_token(classification: str) -> str:
85
+ """Function to get picture classification tokens."""
86
+ return f"<{classification}>"
154
87
 
155
88
  @staticmethod
156
89
  def get_location_token(val: float, rnorm: int = 100):
@@ -172,7 +105,6 @@ class DocumentToken(Enum):
172
105
  page_h: float,
173
106
  xsize: int = 100,
174
107
  ysize: int = 100,
175
- page_i: int = -1,
176
108
  ):
177
109
  """Get the location string give bbox and page-dim."""
178
110
  assert bbox[0] <= bbox[2], f"bbox[0]<=bbox[2] => {bbox[0]}<={bbox[2]}"
@@ -183,17 +115,11 @@ class DocumentToken(Enum):
183
115
  x1 = bbox[2] / page_w
184
116
  y1 = bbox[3] / page_h
185
117
 
186
- page_tok = ""
187
- if page_i != -1:
188
- page_tok = DocumentToken.get_page_token(page=page_i)
189
-
190
118
  x0_tok = DocumentToken.get_location_token(val=min(x0, x1), rnorm=xsize)
191
119
  y0_tok = DocumentToken.get_location_token(val=min(y0, y1), rnorm=ysize)
192
120
  x1_tok = DocumentToken.get_location_token(val=max(x0, x1), rnorm=xsize)
193
121
  y1_tok = DocumentToken.get_location_token(val=max(y0, y1), rnorm=ysize)
194
122
 
195
- loc_str = f"{DocumentToken.BEG_LOCATION.value}"
196
- loc_str += f"{page_tok}{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
197
- loc_str += f"{DocumentToken.END_LOCATION.value}"
123
+ loc_str = f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
198
124
 
199
125
  return loc_str
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.18.1
3
+ Version: 2.19.1
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -18,15 +18,15 @@ docling_core/search/package.py,sha256=Lz2ml2eDy5t0ZimnGTq-DXHAn-f18w0bn4H5xrhs75
18
18
  docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9ACDd57ds,106
19
19
  docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
20
20
  docling_core/transforms/chunker/base.py,sha256=BSWTiFOsF5YaZaZJZY8nwIdOXb9uufJMRIds7LxRNh8,2546
21
- docling_core/transforms/chunker/hierarchical_chunker.py,sha256=cy3sE9w_7l-uoIEUcfnZlQweDHUoyAJTQ6IkzxxVjFY,8052
21
+ docling_core/transforms/chunker/hierarchical_chunker.py,sha256=MStDUDtzFGc6j8v9AkcAnnSHTDxdoiVrp8FTmRdGqU8,8138
22
22
  docling_core/transforms/chunker/hybrid_chunker.py,sha256=kokjDdxjc_gygOokQwYFVnHv2NjWTgf9uex8o0ole7w,9876
23
23
  docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
24
24
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
25
25
  docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
26
26
  docling_core/types/doc/base.py,sha256=lMRNq1DUK7K26L2VNZRqFaItCSZ6m9BdYTVaJA98PZQ,11495
27
- docling_core/types/doc/document.py,sha256=Rn2hA0LPpnt7tGJOD2ME6t5x8R42mttPFD2Ks2cbvVU,102698
28
- docling_core/types/doc/labels.py,sha256=8Luymal9SKXTwyqq1ONKiUTxuMo_nRMYfBkRPFkdSSo,5306
29
- docling_core/types/doc/tokens.py,sha256=GMtm5TsNljBPaMYkgmD3WWZmC0FHqKF9imKEEySz4ps,6020
27
+ docling_core/types/doc/document.py,sha256=t1nk1GeR5_YvZhuWUVZkkBekp89vFB4RBtMuwD3Acw4,104373
28
+ docling_core/types/doc/labels.py,sha256=cqH4DGN9lgZns6gOtL5urzZzUPGOjHJ75xQbIKSh_h8,5306
29
+ docling_core/types/doc/tokens.py,sha256=i73PXkmqXCLsQ5SddnJX8L9e_Ub2_K_DYSE-VE8NDq0,3925
30
30
  docling_core/types/doc/utils.py,sha256=SaiQD-WMMooFm1bMqwatU-IGhtG048iKJb-ppnJit_k,2250
31
31
  docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
32
32
  docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
@@ -56,8 +56,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
56
56
  docling_core/utils/legacy.py,sha256=SqNQAxl97aHfoJEsC9vZcMJg5FNkmqKPFi-wdSrnfI0,24442
57
57
  docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
58
58
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
59
- docling_core-2.18.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
60
- docling_core-2.18.1.dist-info/METADATA,sha256=XV0FP0Uuqjra5G7O6ENd-0FwCc6TK6XasiXbHnCSGEA,5803
61
- docling_core-2.18.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
62
- docling_core-2.18.1.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
63
- docling_core-2.18.1.dist-info/RECORD,,
59
+ docling_core-2.19.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
60
+ docling_core-2.19.1.dist-info/METADATA,sha256=Uz-AUOD2_itxSEVxatsPbCQ0pFBE3fMX-gXx0YLmsKw,5803
61
+ docling_core-2.19.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
62
+ docling_core-2.19.1.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
63
+ docling_core-2.19.1.dist-info/RECORD,,