docling-core 1.6.3__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -0,0 +1,1194 @@
1
+ """Models for the Docling Document data type."""
2
+
3
+ import mimetypes
4
+ import re
5
+ import typing
6
+ from typing import Any, Dict, Final, List, Optional, Tuple, Union
7
+
8
+ import pandas as pd
9
+ from pydantic import (
10
+ AnyUrl,
11
+ BaseModel,
12
+ ConfigDict,
13
+ Field,
14
+ StringConstraints,
15
+ computed_field,
16
+ field_validator,
17
+ model_validator,
18
+ )
19
+ from tabulate import tabulate
20
+ from typing_extensions import Annotated
21
+
22
+ from docling_core.search.package import VERSION_PATTERN
23
+ from docling_core.types.doc.tokens import DocumentToken
24
+ from docling_core.types.experimental import BoundingBox, Size
25
+ from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
26
+
27
# Integer constrained to the unsigned 64-bit range [0, 2**64 - 1].
Uint64 = typing.Annotated[int, Field(ge=0, le=2**64 - 1)]
# Integer constrained to the range [1, 100]; used for section header levels.
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
# Schema version used as the default of ``DoclingDocument.version``.
CURRENT_VERSION: Final = "1.0.0"

# (subset of) JSON Pointer URI fragment identifier format:
# accepts "#", "#/<name>" and "#/<name>/<index>".
_JSON_POINTER_REGEX = r"^#(/[\w\-]+(/\d+)?)?$"
33
+
34
+
35
class BasePictureData(BaseModel):  # TBD
    """Payload model for picture items; it declares no fields yet."""
37
+
38
+
39
class TableCell(BaseModel):
    """One table cell: text, grid offsets, spans and header flags.

    The ``*_offset_idx`` fields describe the half-open row/column ranges the
    cell occupies in the table grid (see ``BaseTableData.grid``).
    """

    bbox: Optional[BoundingBox] = None
    row_span: int = 1
    col_span: int = 1
    start_row_offset_idx: int
    end_row_offset_idx: int
    start_col_offset_idx: int
    end_col_offset_idx: int
    text: str
    column_header: bool = False
    row_header: bool = False
    row_section: bool = False

    @model_validator(mode="before")
    @classmethod
    def from_dict_format(cls, data: Any) -> Any:
        """Fill in ``text`` from token data when a raw cell dict lacks it.

        When ``data`` is a dict without a ``"text"`` entry and its ``"bbox"``
        is itself a dict, the text is taken from ``bbox["token"]``. If that is
        missing or empty, the ``"token"`` values of the ``"text_cell_bboxes"``
        entries are joined with single spaces instead (that key is removed
        from ``data``).

        :param data: Any: raw input handed to the model before validation
        :returns: ``data``, possibly with a ``"text"`` entry added
        """
        if not isinstance(data, dict):
            return data

        bbox = data.get("bbox")
        # An explicitly supplied text always wins; previously it was silently
        # overwritten (usually with "") whenever a bbox was present, which lost
        # the text when a serialised cell was loaded again. A bbox that is not
        # a dict (None or an already built BoundingBox) carries no token.
        if "text" in data or not isinstance(bbox, dict):
            return data

        text = bbox.get("token", "")
        if not text:
            text_cells = data.pop("text_cell_bboxes", None)
            if text_cells:
                text = " ".join(el["token"] for el in text_cells)
            text = text.strip()

        data["text"] = text
        return data
72
+
73
+
74
class BaseTableData(BaseModel):  # TBD
    """Table content: the list of cells plus the row and column counts."""

    table_cells: List[TableCell] = []
    num_rows: int = 0
    num_cols: int = 0

    @computed_field  # type: ignore
    @property
    def grid(
        self,
    ) -> List[List[TableCell]]:
        """Return a dense ``num_rows`` x ``num_cols`` matrix of cells.

        Every position starts as an empty 1x1 cell; each entry of
        ``table_cells`` is then written into all positions covered by its
        offset ranges, clipped to the table dimensions. The matrix is rebuilt
        on every access.
        """
        matrix: List[List[TableCell]] = []
        for row_idx in range(self.num_rows):
            matrix.append(
                [
                    TableCell(
                        text="",
                        start_row_offset_idx=row_idx,
                        end_row_offset_idx=row_idx + 1,
                        start_col_offset_idx=col_idx,
                        end_col_offset_idx=col_idx + 1,
                    )
                    for col_idx in range(self.num_cols)
                ]
            )

        # Place the real cells over the placeholders they cover.
        for cell in self.table_cells:
            row_lo = min(cell.start_row_offset_idx, self.num_rows)
            row_hi = min(cell.end_row_offset_idx, self.num_rows)
            col_lo = min(cell.start_col_offset_idx, self.num_cols)
            col_hi = min(cell.end_col_offset_idx, self.num_cols)
            for row_idx in range(row_lo, row_hi):
                for col_idx in range(col_lo, col_hi):
                    matrix[row_idx][col_idx] = cell

        return matrix
115
+
116
+
117
class DocumentOrigin(BaseModel):
    """Description of the original file a document stems from."""

    mimetype: str  # the mimetype of the original file
    binary_hash: Uint64  # the binary hash of the original file.
    # TODO: Change to be Uint64 and provide utility method to generate

    filename: str  # The name of the original file, including extension, without path.
    # Could stem from filesystem, source URI, Content-Disposition header, ...

    uri: Optional[AnyUrl] = (
        None  # any possible reference to a source file,
        # from any file handler protocol (e.g. https://, file://, s3://)
    )

    @field_validator("binary_hash", mode="before")
    @classmethod
    def parse_hex_string(cls, value):
        """Convert a hexadecimal string into an unsigned 64-bit integer.

        :param value: raw ``binary_hash`` input; strings are parsed as base-16
        :returns: the low 64 bits of the parsed number for string input,
            otherwise ``value`` unchanged
        :raises ValueError: if a string input is not valid hexadecimal
        """
        if not isinstance(value, str):
            return value  # If already an int, return it as is.

        try:
            # Parse with the int constructor directly instead of calling the
            # Annotated alias ``Uint64`` as if it were a type.
            hash_int = int(value, 16)
        except ValueError as err:
            raise ValueError(f"Invalid sha256 hexdigest: {value}") from err

        # Keep only the low 64 bits; the mask equals the Uint64 upper bound,
        # so the result always satisfies the field constraint.
        return hash_int & 0xFFFFFFFFFFFFFFFF

    @field_validator("mimetype")
    @classmethod
    def validate_mimetype(cls, v):
        """Reject MIME types that are unknown to the ``mimetypes`` module.

        :param v: the MIME type string to check
        :returns: ``v`` unchanged
        :raises ValueError: if ``v`` is not among ``mimetypes.types_map`` values
        """
        if v not in mimetypes.types_map.values():
            raise ValueError(f"'{v}' is not a valid MIME type")
        return v
156
+
157
+
158
class RefItem(BaseModel):
    """Reference to a document node, stored as a ``#/<name>[/<index>]`` string.

    The reference is serialised under the ``$ref`` alias and may also be
    populated through the ``cref`` field name.
    """

    cref: str = Field(alias="$ref", pattern=_JSON_POINTER_REGEX)

    model_config = ConfigDict(
        populate_by_name=True,
    )

    # This method makes RefItem compatible with DocItem
    def get_ref(self):
        """Return this reference itself."""
        return self

    def resolve(self, doc: "DoclingDocument"):
        """Look up the object this reference points to inside ``doc``.

        ``#/<name>`` resolves to the attribute ``<name>`` of ``doc``;
        ``#/<name>/<index>`` resolves to element ``<index>`` of that attribute.

        :param doc: "DoclingDocument": document to resolve against
        :returns: the referenced object
        :raises RuntimeError: if the reference has neither two nor three
            ``/``-separated components (e.g. a bare ``#``)
        """
        path_components = self.cref.split("/")
        num_comps = len(path_components)

        if num_comps == 3:
            _, path, index_str = path_components
            index = int(index_str)
            # getattr is the idiomatic spelling of the former direct
            # ``__getattribute__`` call.
            return getattr(doc, path)[index]
        if num_comps == 2:
            return getattr(doc, path_components[1])

        raise RuntimeError(f"Unsupported number of path components: {num_comps}")
185
+
186
+
187
class ImageRef(BaseModel):
    """Reference to an image: MIME type, dpi, size and URI."""

    mimetype: str
    dpi: int
    size: Size
    uri: AnyUrl

    @field_validator("mimetype")
    @classmethod
    def validate_mimetype(cls, v):
        """Accept ``v`` only if the ``mimetypes`` module knows it.

        :param v: the MIME type string to check
        :returns: ``v`` unchanged
        :raises ValueError: if ``v`` is not among ``mimetypes.types_map`` values
        """
        known_types = mimetypes.types_map.values()
        if v in known_types:
            return v
        raise ValueError(f"'{v}' is not a valid MIME type")
203
+
204
+
205
class ProvenanceItem(BaseModel):
    """Location of an item: page number, bounding box and character span."""

    page_no: int
    bbox: BoundingBox
    charspan: Tuple[int, int]
211
+
212
+
213
class NodeItem(BaseModel):
    """Tree node with its own reference, an optional parent and children.

    Unknown fields are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
    parent: Optional[RefItem] = None
    children: List[RefItem] = []

    def get_ref(self):
        """Return a new ``RefItem`` pointing at this node's ``self_ref``."""
        own_ref = RefItem(cref=self.self_ref)
        return own_ref
225
+
226
+
227
class GroupItem(NodeItem):  # Container type, can't be a leaf node
    """Named, labelled container node."""

    # Name of the group, e.g. "Introduction Chapter",
    # "Slide 5", "Navigation menu list", ...
    name: str = "group"
    label: GroupLabel = GroupLabel.UNSPECIFIED
235
+
236
+
237
class DocItem(
    NodeItem
):  # Base type for any element that carries content, can be a leaf node
    """Labelled content node with a list of provenance entries."""

    label: DocItemLabel
    prov: List[ProvenanceItem] = []

    def get_location_tokens(
        self,
        doc: "DoclingDocument",
        new_line: str,
        xsize: int = 100,
        ysize: int = 100,
        add_page_index: bool = True,
    ) -> str:
        """Build the location string for every provenance entry of this item.

        :param doc: "DoclingDocument": document whose ``pages`` supply the
            page sizes
        :param new_line: str: text appended after each location token
        :param xsize: int: (Default value = 100)
        :param ysize: int: (Default value = 100)
        :param add_page_index: bool: pass the page number to the token
            builder when True, ``-1`` otherwise (Default value = True)
        :returns: the concatenated location tokens; ``""`` without provenance
        """
        chunks: List[str] = []
        for entry in self.prov:
            page_w, page_h = doc.pages[entry.page_no].size.as_tuple()
            page_i = entry.page_no if add_page_index else -1

            loc_str = DocumentToken.get_location(
                bbox=entry.bbox.to_bottom_left_origin(page_h).as_tuple(),
                page_w=page_w,
                page_h=page_h,
                xsize=xsize,
                ysize=ysize,
                page_i=page_i,
            )
            chunks.append(f"{loc_str}{new_line}")

        return "".join(chunks)
276
+
277
+
278
class TextItem(DocItem):
    """Text-bearing item holding both raw and sanitized text."""

    orig: str  # untreated representation
    text: str  # sanitized representation

    def export_to_document_tokens(
        self,
        doc: "DoclingDocument",
        new_line: str = "\n",
        xsize: int = 100,
        ysize: int = 100,
        add_location: bool = True,
        add_content: bool = True,
        add_page_index: bool = True,
    ):
        r"""Render this item as ``<label>location text</label>``.

        :param doc: "DoclingDocument": document used to compute locations
        :param new_line: str: text appended after the closing tag
            (Default value = "\n")
        :param xsize: int: (Default value = 100)
        :param ysize: int: (Default value = 100)
        :param add_location: bool: include location tokens
            (Default value = True)
        :param add_content: bool: include the stripped text
            (Default value = True)
        :param add_page_index: bool: (Default value = True)
        :returns: the document-token string for this item
        """
        # TODO: This must be done through an explicit mapping.
        # assert DocumentToken.is_known_token(
        #     body
        # ), f"failed DocumentToken.is_known_token({body})"
        pieces = [f"<{self.label.value}>"]

        if add_location:
            pieces.append(
                self.get_location_tokens(
                    doc=doc,
                    new_line="",
                    xsize=xsize,
                    ysize=ysize,
                    add_page_index=add_page_index,
                )
            )

        if add_content and self.text is not None:
            pieces.append(self.text.strip())

        pieces.append(f"</{self.label.value}>{new_line}")

        return "".join(pieces)
327
+
328
+
329
class SectionHeaderItem(TextItem):
    """Text item fixed to the section-header label, with a heading level."""

    label: typing.Literal[DocItemLabel.SECTION_HEADER] = DocItemLabel.SECTION_HEADER
    level: LevelNumber
334
+
335
+
336
class FloatingItem(DocItem):
    """Item carrying caption, reference and footnote refs plus an image."""

    captions: List[RefItem] = []
    references: List[RefItem] = []
    footnotes: List[RefItem] = []
    image: Optional[ImageRef] = None
343
+
344
+
345
class PictureItem(FloatingItem):
    """Floating item fixed to the picture label, with picture data."""

    label: typing.Literal[DocItemLabel.PICTURE] = DocItemLabel.PICTURE

    data: BasePictureData

    def export_to_document_tokens(
        self,
        doc: "DoclingDocument",
        new_line: str = "\n",
        xsize: int = 100,
        ysize: int = 100,
        add_location: bool = True,
        add_caption: bool = True,
        add_content: bool = True,  # not used at the moment
        add_page_index: bool = True,
    ):
        r"""Render the picture as a figure block of document tokens.

        :param doc: "DoclingDocument": document used for locations and for
            resolving caption references
        :param new_line: str: (Default value = "\n")
        :param xsize: int: (Default value = 100)
        :param ysize: int: (Default value = 100)
        :param add_location: bool: (Default value = True)
        :param add_caption: bool: (Default value = True)
        :param add_content: bool: not used at the moment
            (Default value = True)
        :param add_page_index: bool: (Default value = True)
        :returns: the document-token string for this picture
        """
        parts = [f"{DocumentToken.BEG_FIGURE.value}{new_line}"]

        if add_location:
            parts.append(
                self.get_location_tokens(
                    doc=doc,
                    new_line=new_line,
                    xsize=xsize,
                    ysize=ysize,
                    add_page_index=add_page_index,
                )
            )

        if add_caption and len(self.captions):
            # All caption texts are concatenated without a separator.
            caption_text = "".join(cap.resolve(doc).text for cap in self.captions)

            if len(caption_text):
                parts.append(f"{DocumentToken.BEG_CAPTION.value}")
                parts.append(f"{caption_text.strip()}")
                parts.append(f"{DocumentToken.END_CAPTION.value}")
                parts.append(f"{new_line}")

        parts.append(f"{DocumentToken.END_FIGURE.value}{new_line}")

        return "".join(parts)
400
+
401
+
402
class TableItem(FloatingItem):
    """Floating item fixed to the table label, with table data and exporters."""

    data: BaseTableData
    label: typing.Literal[DocItemLabel.TABLE] = DocItemLabel.TABLE

    def export_to_dataframe(self) -> pd.DataFrame:
        """Export the table as a Pandas DataFrame.

        Leading rows that contain at least one column-header cell become the
        column names (texts of stacked header rows are joined with ``"."``);
        the remaining rows become the data.

        :returns: the DataFrame; an empty one if the table has no rows or
            no columns
        :raises RuntimeError: if a grid row turns out to be empty
        """
        if self.data is None or self.data.num_rows == 0 or self.data.num_cols == 0:
            return pd.DataFrame()

        # ``grid`` is rebuilt on every access, so build it once.
        grid = self.data.grid

        # Count how many rows are column headers
        num_headers = 0
        for row in grid:
            if len(row) == 0:
                raise RuntimeError(
                    f"Invalid table. {len(row)=} but {self.data.num_cols=}."
                )

            if not any(cell.column_header for cell in row):
                break
            num_headers += 1

        # Create the column names from all col_headers
        columns: Optional[List[str]] = None
        if num_headers > 0:
            columns = ["" for _ in range(self.data.num_cols)]
            for header_row in grid[:num_headers]:
                for j, cell in enumerate(header_row):
                    col_name = cell.text
                    if columns[j] != "":
                        col_name = f".{col_name}"
                    columns[j] += col_name

        # Create table data
        table_data = [[cell.text for cell in row] for row in grid[num_headers:]]

        # Create DataFrame
        return pd.DataFrame(table_data, columns=columns)

    def export_to_html(self) -> str:
        """Export the table as an HTML ``<table>`` string.

        A cell is emitted only at the grid position where it starts, so
        spanning cells appear once, with ``rowspan``/``colspan`` attributes
        when their span exceeds one. Column-header cells use ``<th>``, all
        others ``<td>``. Cell text is HTML-escaped.

        :returns: the HTML markup, or ``""`` if the table has no cells
        """
        # Imported locally so this method does not depend on the module's
        # import block.
        import html

        if not len(self.data.table_cells):
            return ""

        nrows = self.data.num_rows
        ncols = self.data.num_cols
        # ``grid`` is rebuilt on every access; the original looked it up once
        # per cell, which rebuilt the whole matrix nrows * ncols times.
        grid = self.data.grid

        body = ""
        for i in range(nrows):
            body += "<tr>"
            for j in range(ncols):
                cell: TableCell = grid[i][j]

                # Skip positions that are merely covered by a spanning cell.
                if cell.start_row_offset_idx != i or cell.start_col_offset_idx != j:
                    continue

                # Escape the text so characters such as "<" or "&" cannot
                # break or inject markup.
                content = html.escape(cell.text.strip())
                celltag = "th" if cell.column_header else "td"

                opening_tag = f"{celltag}"
                if cell.row_span > 1:
                    opening_tag += f' rowspan="{cell.row_span}"'
                if cell.col_span > 1:
                    opening_tag += f' colspan="{cell.col_span}"'

                body += f"<{opening_tag}>{content}</{celltag}>"
            body += "</tr>"

        return f"<table>{body}</table>"

    def export_to_document_tokens(
        self,
        doc: "DoclingDocument",
        new_line: str = "\n",
        xsize: int = 100,
        ysize: int = 100,
        add_location: bool = True,
        add_caption: bool = True,
        add_content: bool = True,
        add_cell_location: bool = True,
        add_cell_label: bool = True,
        add_cell_text: bool = True,
        add_page_index: bool = True,
    ):
        r"""Export table to document tokens format.

        :param doc: "DoclingDocument": document used for locations and for
            resolving caption references
        :param new_line: str: (Default value = "\n")
        :param xsize: int: (Default value = 100)
        :param ysize: int: (Default value = 100)
        :param add_location: bool: include the table location
            (Default value = True)
        :param add_caption: bool: (Default value = True)
        :param add_content: bool: include the rows and cells
            (Default value = True)
        :param add_cell_location: bool: include a location per cell that has
            a bbox; uses the page of the table's first provenance entry
            (Default value = True)
        :param add_cell_label: bool: (Default value = True)
        :param add_cell_text: bool: (Default value = True)
        :param add_page_index: bool: (Default value = True)
        :returns: the document-token string for this table
        """
        body = f"{DocumentToken.BEG_TABLE.value}{new_line}"

        if add_location:
            body += self.get_location_tokens(
                doc=doc,
                new_line=new_line,
                xsize=xsize,
                ysize=ysize,
                add_page_index=add_page_index,
            )

        if add_caption and len(self.captions):
            text = "".join(cap.resolve(doc).text for cap in self.captions)

            if len(text):
                body += f"{DocumentToken.BEG_CAPTION.value}"
                body += f"{text.strip()}"
                body += f"{DocumentToken.END_CAPTION.value}"
                body += f"{new_line}"

        if add_content and len(self.data.table_cells) > 0:
            for i, row in enumerate(self.data.grid):
                body += f"<row_{i}>"
                for j, col in enumerate(row):
                    text = col.text.strip() if add_cell_text else ""

                    # The original had two identical branches that differed
                    # only in the page index passed on; they are merged here.
                    cell_loc = ""
                    if (
                        col.bbox is not None
                        and add_cell_location
                        and len(self.prov) > 0
                    ):
                        page_no = self.prov[0].page_no
                        page_w, page_h = doc.pages[page_no].size.as_tuple()
                        cell_loc = DocumentToken.get_location(
                            bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
                            page_w=page_w,
                            page_h=page_h,
                            xsize=xsize,
                            ysize=ysize,
                            page_i=page_no if add_page_index else -1,
                        )

                    cell_label = ""
                    if add_cell_label:
                        if col.column_header:
                            cell_label = "<col_header>"
                        elif col.row_header:
                            cell_label = "<row_header>"
                        elif col.row_section:
                            cell_label = "<row_section>"
                        else:
                            cell_label = "<body>"

                    body += f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>"

                body += f"</row_{i}>{new_line}"

        body += f"{DocumentToken.END_TABLE.value}{new_line}"

        return body
608
+
609
+
610
class KeyValueItem(DocItem):
    """Key-value document item; adds no fields beyond ``DocItem``."""
612
+
613
+
614
# Union of the concrete item types that carry content.
ContentItem = Union[
    TextItem,
    SectionHeaderItem,
    PictureItem,
    TableItem,
    KeyValueItem,
]
615
+
616
+
617
class PageItem(BaseModel):
    """A document page: its size, an optional image and its page number."""

    # A page carries separate root items for furniture and body,
    # only referencing items on the page
    size: Size
    image: Optional[ImageRef] = None
    page_no: int
625
+
626
+
627
class DescriptionItem(BaseModel):
    """Document description model; it declares no fields yet."""
629
+
630
+
631
class DoclingDocument(BaseModel):
    """Root model of a Docling document.

    Content items live in flat lists (``groups``, ``texts``, ``pictures``,
    ``tables``, ``key_value_items``); the ``body`` and ``furniture`` group
    items are the roots of two trees that refer to those items through
    ``RefItem`` references.
    """

    schema_name: typing.Literal["DoclingDocument"] = "DoclingDocument"
    version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
        CURRENT_VERSION
    )
    description: DescriptionItem
    name: str  # The working name of this document, without extensions
    # (could be taken from originating doc, or just "Untitled 1")
    origin: Optional[DocumentOrigin] = (
        None  # DoclingDocuments may specify an origin (converted to DoclingDocument).
        # This is optional, e.g. a DoclingDocument could also be entirely
        # generated from synthetic data.
    )

    furniture: GroupItem = GroupItem(
        name="_root_", self_ref="#/furniture"
    )  # List[RefItem] = []
    body: GroupItem = GroupItem(name="_root_", self_ref="#/body")  # List[RefItem] = []

    groups: List[GroupItem] = []
    texts: List[Union[SectionHeaderItem, TextItem]] = []
    pictures: List[PictureItem] = []
    tables: List[TableItem] = []
    key_value_items: List[KeyValueItem] = []

    pages: Dict[int, PageItem] = {}  # empty as default

    def add_group(
        self,
        label: Optional[GroupLabel] = None,
        name: Optional[str] = None,
        parent: Optional[GroupItem] = None,
    ) -> GroupItem:
        """Create a group, append it to ``groups`` and link it to its parent.

        :param label: Optional[GroupLabel]: label to set; the GroupItem
            default is kept when None (Default value = None)
        :param name: Optional[str]: name to set; the GroupItem default is
            kept when None (Default value = None)
        :param parent: Optional[GroupItem]: parent node; ``body`` is used
            when omitted (Default value = None)
        :returns: the newly created group
        """
        if not parent:
            parent = self.body

        group_index = len(self.groups)
        cref = f"#/groups/{group_index}"

        group = GroupItem(self_ref=cref, parent=parent.get_ref())
        if name is not None:
            group.name = name
        if label is not None:
            group.label = label

        self.groups.append(group)
        parent.children.append(RefItem(cref=cref))

        return group

    def add_text(
        self,
        label: str,
        text: str,
        orig: Optional[str] = None,
        prov: Optional[ProvenanceItem] = None,
        parent: Optional[GroupItem] = None,
    ):
        """Create a text item, append it to ``texts`` and link it to its parent.

        :param label: str: label of the new item
        :param text: str: sanitized text
        :param orig: Optional[str]: untreated text; falls back to ``text``
            when empty or omitted (Default value = None)
        :param prov: Optional[ProvenanceItem]: (Default value = None)
        :param parent: Optional[GroupItem]: parent node; ``body`` is used
            when omitted (Default value = None)
        :returns: the newly created text item
        """
        if not parent:
            parent = self.body

        if not orig:
            orig = text

        text_index = len(self.texts)
        cref = f"#/texts/{text_index}"
        text_item = TextItem(
            label=label,
            text=text,
            orig=orig,
            self_ref=cref,
            parent=parent.get_ref(),
        )
        if prov:
            text_item.prov.append(prov)

        self.texts.append(text_item)
        parent.children.append(RefItem(cref=cref))

        return text_item

    def add_table(
        self,
        data: BaseTableData,
        caption: Optional[Union[TextItem, RefItem]] = None,  # This is not cool yet.
        prov: Optional[ProvenanceItem] = None,
        parent: Optional[GroupItem] = None,
    ):
        """Create a table item, append it to ``tables`` and link it to its parent.

        :param data: BaseTableData: the table content
        :param caption: Optional[Union[TextItem, RefItem]]: caption whose
            reference is added to the table's captions (Default value = None)
        :param prov: Optional[ProvenanceItem]: (Default value = None)
        :param parent: Optional[GroupItem]: parent node; ``body`` is used
            when omitted (Default value = None)
        :returns: the newly created table item
        """
        if not parent:
            parent = self.body

        table_index = len(self.tables)
        cref = f"#/tables/{table_index}"

        tbl_item = TableItem(
            label=DocItemLabel.TABLE, data=data, self_ref=cref, parent=parent.get_ref()
        )
        if prov:
            tbl_item.prov.append(prov)
        if caption:
            tbl_item.captions.append(caption.get_ref())

        self.tables.append(tbl_item)
        parent.children.append(RefItem(cref=cref))

        return tbl_item

    def add_picture(
        self,
        data: BasePictureData,
        caption: Optional[Union[TextItem, RefItem]] = None,
        prov: Optional[ProvenanceItem] = None,
        parent: Optional[GroupItem] = None,
    ):
        """Create a picture item, append it to ``pictures`` and link it to its parent.

        :param data: BasePictureData: the picture payload
        :param caption: Optional[Union[TextItem, RefItem]]: caption whose
            reference is added to the picture's captions (Default value = None)
        :param prov: Optional[ProvenanceItem]: (Default value = None)
        :param parent: Optional[GroupItem]: parent node; ``body`` is used
            when omitted (Default value = None)
        :returns: the newly created picture item
        """
        if not parent:
            parent = self.body

        picture_index = len(self.pictures)
        cref = f"#/pictures/{picture_index}"

        fig_item = PictureItem(
            label=DocItemLabel.PICTURE,
            data=data,
            self_ref=cref,
            parent=parent.get_ref(),
        )
        if prov:
            fig_item.prov.append(prov)
        if caption:
            fig_item.captions.append(caption.get_ref())

        self.pictures.append(fig_item)
        parent.children.append(RefItem(cref=cref))

        return fig_item

    def add_heading(
        self,
        text: str,
        orig: Optional[str] = None,
        level: LevelNumber = 1,
        prov: Optional[ProvenanceItem] = None,
        parent: Optional[GroupItem] = None,
    ):
        """Create a section header, append it to ``texts`` and link it to its parent.

        :param text: str: sanitized heading text
        :param orig: Optional[str]: untreated text; falls back to ``text``
            when empty or omitted (Default value = None)
        :param level: LevelNumber: (Default value = 1)
        :param prov: Optional[ProvenanceItem]: (Default value = None)
        :param parent: Optional[GroupItem]: parent node; ``body`` is used
            when omitted (Default value = None)
        :returns: the newly created section header item
        """
        if not parent:
            parent = self.body

        if not orig:
            orig = text

        text_index = len(self.texts)
        cref = f"#/texts/{text_index}"
        section_header_item = SectionHeaderItem(
            level=level,
            text=text,
            orig=orig,
            self_ref=cref,
            parent=parent.get_ref(),
        )
        if prov:
            section_header_item.prov.append(prov)

        self.texts.append(section_header_item)
        parent.children.append(RefItem(cref=cref))

        return section_header_item

    def num_pages(self):
        """Return the number of entries in ``pages``."""
        return len(self.pages)

    def validate_tree(self, root) -> bool:
        """Check that every descendant's parent reference points back correctly.

        :param root: node whose subtree is checked
        :returns: False as soon as a direct child's ``parent`` does not
            resolve to ``root``; otherwise True if all child subtrees are
            valid (a node without children is valid)
        """
        res = []
        for child_ref in root.children:
            child = child_ref.resolve(self)
            if child.parent.resolve(self) != root:
                return False
            res.append(self.validate_tree(child))

        # all([]) is True, which covers the leaf case.
        return all(res)

    def iterate_items(
        self,
        root: Optional[NodeItem] = None,
        with_groups: bool = False,
        traverse_pictures: bool = True,
        page_no: Optional[int] = None,
        _level: int = 0,  # fixed parameter, carries through the node nesting level
    ) -> typing.Iterable[Tuple[NodeItem, int]]:  # tuple of node and level
        """Walk the tree below ``root`` depth-first, yielding (node, level).

        :param root: Optional[NodeItem]: start node; ``body`` when omitted
            (Default value = None)
        :param with_groups: bool: also yield GroupItem nodes
            (Default value = False)
        :param traverse_pictures: bool: when False, picture children are
            skipped together with their subtrees (Default value = True)
        :param page_no: Optional[int]: when given, only DocItems with a
            provenance entry on that page are yielded; nodes that are not
            DocItems are not filtered (Default value = None)
        :param _level: int: nesting level of ``root``; internal, carried
            through the recursion (Default value = 0)
        """
        if root is None:
            root = self.body

        if not isinstance(root, GroupItem) or with_groups:
            if isinstance(root, DocItem):
                # Yield each item at most once, even if several of its
                # provenance entries lie on the requested page.
                if page_no is None or any(
                    prov.page_no == page_no for prov in root.prov
                ):
                    yield root, _level
            else:
                yield root, _level

        # Traverse children
        for child_ref in root.children:
            child = child_ref.resolve(self)

            if isinstance(child, NodeItem):
                # If the child is a NodeItem, recursively traverse it
                if not isinstance(child, PictureItem) or traverse_pictures:
                    # Forward every option; page_no and traverse_pictures
                    # used to be dropped here, so they only ever applied to
                    # the root node.
                    yield from self.iterate_items(
                        child,
                        with_groups=with_groups,
                        traverse_pictures=traverse_pictures,
                        page_no=page_no,
                        _level=_level + 1,
                    )

    def print_element_tree(self):
        """Print one indented line per node of the body tree, groups included."""
        for ix, (item, level) in enumerate(self.iterate_items(with_groups=True)):
            if isinstance(item, GroupItem):
                print(" " * level, f"{ix}: {item.label.value} with name={item.name}")
            elif isinstance(item, DocItem):
                print(" " * level, f"{ix}: {item.label.value}")

    def export_to_markdown(
        self,
        delim: str = "\n\n",
        from_element: int = 0,
        to_element: Optional[int] = None,
        labels: list[DocItemLabel] = [
            DocItemLabel.TITLE,
            DocItemLabel.SECTION_HEADER,
            DocItemLabel.PARAGRAPH,
            DocItemLabel.CAPTION,
            DocItemLabel.TABLE,
            DocItemLabel.TEXT,
        ],
        strict_text: bool = False,
    ) -> str:
        r"""Serialize to Markdown.

        Operates on a slice of the items reachable from ``body``, selected
        through ``from_element`` and ``to_element``; defaults to all items.

        :param delim: Delimiter to use when concatenating the various
            Markdown parts. Defaults to "\n\n".
        :type delim: str
        :param from_element: Body slicing start index (inclusive).
            Defaults to 0.
        :type from_element: int
        :param to_element: Body slicing stop index
            (exclusive). Defaults to None.
        :type to_element: Optional[int]
        :param labels: Only items whose label is in this list are exported.
            The default list is never modified.
        :type labels: list[DocItemLabel]
        :param strict_text: When True, headings are emitted without ``#``
            markers and tables are omitted. Defaults to False.
        :type strict_text: bool
        :returns: The exported Markdown representation.
        :rtype: str
        """
        has_title = False
        prev_text = ""
        md_texts: list[str] = []

        for ix, (item, level) in enumerate(self.iterate_items(self.body)):
            if ix < from_element:
                continue  # skip as many items as you want

            # Compare against None so that to_element=0 yields an empty slice
            # instead of being treated as "no limit".
            if to_element is not None and ix >= to_element:
                break

            markdown_text = ""

            if isinstance(item, DocItem):
                item_type = item.label

                if isinstance(item, TextItem) and item_type in labels:
                    text = item.text

                    # ignore repeated text
                    if prev_text == text or text is None:
                        continue
                    else:
                        prev_text = text

                    # first title match
                    if item_type == DocItemLabel.TITLE and not has_title:
                        if strict_text:
                            markdown_text = f"{text}"
                        else:
                            markdown_text = f"# {text}"
                        has_title = True

                    # secondary titles: later titles and section headers.
                    # Section headers were previously matched only through the
                    # legacy "subtitle-level-1" string and therefore rendered
                    # as plain text; the string check is kept for
                    # compatibility.
                    elif (
                        item_type == DocItemLabel.TITLE
                        or isinstance(item, SectionHeaderItem)
                        or item_type == "subtitle-level-1"
                    ):
                        if strict_text:
                            markdown_text = f"{text}"
                        else:
                            markdown_text = f"## {text}"

                    # normal text
                    else:
                        markdown_text = text

                elif (
                    isinstance(item, TableItem)
                    and item.data
                    and item_type in labels
                    and not strict_text
                ):
                    table = [[col.text for col in row] for row in item.data.grid]

                    # The first row serves as the header row, so at least two
                    # rows are required.
                    if len(table) > 1 and len(table[0]) > 0:
                        try:
                            md_table = tabulate(
                                table[1:], headers=table[0], tablefmt="github"
                            )
                        except ValueError:
                            md_table = tabulate(
                                table[1:],
                                headers=table[0],
                                tablefmt="github",
                                disable_numparse=True,
                            )

                        markdown_text = md_table

            if markdown_text:
                md_texts.append(markdown_text)

        result = delim.join(md_texts)
        return result

    def export_to_document_tokens(
        self,
        delim: str = "\n\n",
        from_element: int = 0,
        to_element: Optional[int] = None,
        labels: list[DocItemLabel] = [
            DocItemLabel.TITLE,
            DocItemLabel.SECTION_HEADER,
            DocItemLabel.PARAGRAPH,
            DocItemLabel.CAPTION,
            DocItemLabel.TABLE,
            DocItemLabel.TEXT,
        ],
        xsize: int = 100,
        ysize: int = 100,
        add_location: bool = True,
        add_content: bool = True,
        add_page_index: bool = True,
        # table specific flags
        add_table_cell_location: bool = False,
        add_table_cell_label: bool = True,
        add_table_cell_text: bool = True,
    ) -> str:
        r"""Exports the document content to an DocumentToken format.

        Operates on a slice of the items reachable from ``body``, selected
        through ``from_element`` and ``to_element``; defaults to all items.

        :param delim: str: only its truthiness is used: a non-empty value
            makes "\n" the line separator, an empty one disables line breaks
            (Default value = "\n\n")
        :param from_element: int: slicing start index, inclusive
            (Default value = 0)
        :param to_element: Optional[int]: slicing stop index, exclusive
            (Default value = None)
        :param labels: list[DocItemLabel]: only items whose label is in this
            list are exported; the default list is never modified
        :param xsize: int: (Default value = 100)
        :param ysize: int: (Default value = 100)
        :param add_location: bool: (Default value = True)
        :param add_content: bool: (Default value = True)
        :param add_page_index: bool: (Default value = True)
        :param add_table_cell_location: bool: (Default value = False)
        :param add_table_cell_label: bool: (Default value = True)
        :param add_table_cell_text: bool: (Default value = True)
        :returns: The content of the document formatted as a DocTags string.
        :rtype: str
        """
        new_line = "\n" if delim else ""

        doctags = f"{DocumentToken.BEG_DOCUMENT.value}{new_line}"

        for ix, (item, level) in enumerate(self.iterate_items(self.body)):
            if ix < from_element:
                continue  # skip as many items as you want

            # Compare against None so that to_element=0 yields an empty slice
            # instead of being treated as "no limit".
            if to_element is not None and ix >= to_element:
                break

            # The unused page-dimension lookup that used to sit here was
            # removed: it had no effect except raising KeyError for items
            # whose first provenance page is missing from ``pages``.
            if not isinstance(item, DocItem) or item.label not in labels:
                continue

            if isinstance(item, TextItem):
                doctags += item.export_to_document_tokens(
                    doc=self,
                    new_line=new_line,
                    xsize=xsize,
                    ysize=ysize,
                    add_location=add_location,
                    add_content=add_content,
                    add_page_index=add_page_index,
                )

            elif isinstance(item, TableItem):
                doctags += item.export_to_document_tokens(
                    doc=self,
                    new_line=new_line,
                    xsize=xsize,
                    ysize=ysize,
                    add_caption=True,
                    add_location=add_location,
                    add_content=add_content,
                    add_cell_location=add_table_cell_location,
                    add_cell_label=add_table_cell_label,
                    add_cell_text=add_table_cell_text,
                    add_page_index=add_page_index,
                )

            elif isinstance(item, PictureItem):
                doctags += item.export_to_document_tokens(
                    doc=self,
                    new_line=new_line,
                    xsize=xsize,
                    ysize=ysize,
                    add_caption=True,
                    add_location=add_location,
                    add_content=add_content,
                    add_page_index=add_page_index,
                )

        doctags += DocumentToken.END_DOCUMENT.value

        return doctags

    def add_page(self, page_no: int, size: Size) -> PageItem:
        """Create a page and store it in ``pages`` under its page number.

        An existing entry with the same page number is replaced.

        :param page_no: int: key of the page in ``pages``
        :param size: Size: size of the page
        :returns: the newly created page item
        """
        pitem = PageItem(page_no=page_no, size=size)

        self.pages[page_no] = pitem
        return pitem

    @field_validator("version")
    @classmethod
    def check_version_is_compatible(cls, v: str) -> str:
        """Check if this document version is compatible with current version.

        A version is compatible when its major part equals the current major
        part and its minor part is not newer than the current minor part.

        :param v: str: the version found in the document
        :returns: ``CURRENT_VERSION`` (the document's own version string is
            not kept)
        :raises ValueError: if the version is incompatible
        """
        current_match = re.match(VERSION_PATTERN, CURRENT_VERSION)
        doc_match = re.match(VERSION_PATTERN, v)
        if (
            doc_match is None
            or current_match is None
            or doc_match["major"] != current_match["major"]
            # Compare numerically: as strings, "10" > "9" would be False.
            # NOTE(review): assumes the "minor" group of VERSION_PATTERN only
            # matches digits -- confirm against its definition.
            or int(doc_match["minor"]) > int(current_match["minor"])
        ):
            raise ValueError(
                f"incompatible version {v} with schema version {CURRENT_VERSION}"
            )
        else:
            return CURRENT_VERSION

    @model_validator(mode="after")  # type: ignore
    @classmethod
    def validate_document(cls, d: "DoclingDocument"):
        """Reject documents whose body or furniture tree is inconsistent.

        :param d: "DoclingDocument": the validated document
        :returns: ``d`` unchanged
        :raises ValueError: if ``validate_tree`` fails for either tree
        """
        if not d.validate_tree(d.body) or not d.validate_tree(d.furniture):
            raise ValueError("Document hierachy is inconsistent.")

        return d