docling-core 1.7.1__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (36) hide show
  1. docling_core/transforms/chunker/__init__.py +2 -8
  2. docling_core/transforms/chunker/base.py +27 -40
  3. docling_core/transforms/chunker/hierarchical_chunker.py +144 -312
  4. docling_core/types/__init__.py +12 -8
  5. docling_core/types/doc/__init__.py +25 -0
  6. docling_core/types/doc/base.py +136 -451
  7. docling_core/types/doc/document.py +1288 -559
  8. docling_core/types/{experimental → doc}/labels.py +4 -1
  9. docling_core/types/legacy_doc/__init__.py +6 -0
  10. docling_core/types/legacy_doc/base.py +485 -0
  11. docling_core/types/{doc → legacy_doc}/doc_ann.py +1 -1
  12. docling_core/types/{doc → legacy_doc}/doc_ocr.py +1 -1
  13. docling_core/types/{doc → legacy_doc}/doc_raw.py +1 -1
  14. docling_core/types/legacy_doc/document.py +715 -0
  15. docling_core/types/rec/subject.py +1 -1
  16. docling_core/utils/generate_docs.py +82 -0
  17. docling_core/utils/{ds_generate_jsonschema.py → generate_jsonschema.py} +4 -4
  18. docling_core/utils/validators.py +3 -3
  19. {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/METADATA +11 -11
  20. {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/RECORD +24 -31
  21. docling_core-2.0.0.dist-info/entry_points.txt +5 -0
  22. docling_core/transforms/id_generator/__init__.py +0 -12
  23. docling_core/transforms/id_generator/base.py +0 -30
  24. docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -27
  25. docling_core/transforms/id_generator/uuid_generator.py +0 -34
  26. docling_core/transforms/metadata_extractor/__init__.py +0 -13
  27. docling_core/transforms/metadata_extractor/base.py +0 -59
  28. docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +0 -59
  29. docling_core/types/experimental/__init__.py +0 -30
  30. docling_core/types/experimental/base.py +0 -167
  31. docling_core/types/experimental/document.py +0 -1192
  32. docling_core/utils/ds_generate_docs.py +0 -144
  33. docling_core-1.7.1.dist-info/entry_points.txt +0 -5
  34. /docling_core/types/{doc → legacy_doc}/tokens.py +0 -0
  35. {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/LICENSE +0 -0
  36. {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/WHEEL +0 -0
@@ -1,1192 +0,0 @@
1
- """Models for the Docling Document data type."""
2
-
3
- import mimetypes
4
- import re
5
- import typing
6
- from typing import Any, Dict, Final, List, Optional, Tuple, Union
7
-
8
- import pandas as pd
9
- from pydantic import (
10
- AnyUrl,
11
- BaseModel,
12
- ConfigDict,
13
- Field,
14
- StringConstraints,
15
- computed_field,
16
- field_validator,
17
- model_validator,
18
- )
19
- from tabulate import tabulate
20
- from typing_extensions import Annotated
21
-
22
- from docling_core.search.package import VERSION_PATTERN
23
- from docling_core.types.base import _JSON_POINTER_REGEX
24
- from docling_core.types.doc.tokens import DocumentToken
25
- from docling_core.types.experimental import BoundingBox, Size
26
- from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
27
-
28
- Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
29
- LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
30
- CURRENT_VERSION: Final = "1.0.0"
31
-
32
-
33
- class BasePictureData(BaseModel): # TBD
34
- """BasePictureData."""
35
-
36
-
37
- class TableCell(BaseModel):
38
- """TableCell."""
39
-
40
- bbox: Optional[BoundingBox] = None
41
- row_span: int = 1
42
- col_span: int = 1
43
- start_row_offset_idx: int
44
- end_row_offset_idx: int
45
- start_col_offset_idx: int
46
- end_col_offset_idx: int
47
- text: str
48
- column_header: bool = False
49
- row_header: bool = False
50
- row_section: bool = False
51
-
52
- @model_validator(mode="before")
53
- @classmethod
54
- def from_dict_format(cls, data: Any) -> Any:
55
- """from_dict_format."""
56
- if isinstance(data, Dict):
57
- if "bbox" not in data or data["bbox"] is None:
58
- return data
59
- text = data["bbox"].get("token", "")
60
- if not len(text):
61
- text_cells = data.pop("text_cell_bboxes", None)
62
- if text_cells:
63
- for el in text_cells:
64
- text += el["token"] + " "
65
-
66
- text = text.strip()
67
- data["text"] = text
68
-
69
- return data
70
-
71
-
72
- class BaseTableData(BaseModel): # TBD
73
- """BaseTableData."""
74
-
75
- table_cells: List[TableCell] = []
76
- num_rows: int = 0
77
- num_cols: int = 0
78
-
79
- @computed_field # type: ignore
80
- @property
81
- def grid(
82
- self,
83
- ) -> List[List[TableCell]]:
84
- """grid."""
85
- # Initialise empty table data grid (only empty cells)
86
- table_data = [
87
- [
88
- TableCell(
89
- text="",
90
- start_row_offset_idx=i,
91
- end_row_offset_idx=i + 1,
92
- start_col_offset_idx=j,
93
- end_col_offset_idx=j + 1,
94
- )
95
- for j in range(self.num_cols)
96
- ]
97
- for i in range(self.num_rows)
98
- ]
99
-
100
- # Overwrite cells in table data for which there is actual cell content.
101
- for cell in self.table_cells:
102
- for i in range(
103
- min(cell.start_row_offset_idx, self.num_rows),
104
- min(cell.end_row_offset_idx, self.num_rows),
105
- ):
106
- for j in range(
107
- min(cell.start_col_offset_idx, self.num_cols),
108
- min(cell.end_col_offset_idx, self.num_cols),
109
- ):
110
- table_data[i][j] = cell
111
-
112
- return table_data
113
-
114
-
115
- class DocumentOrigin(BaseModel):
116
- """FileSource."""
117
-
118
- mimetype: str # the mimetype of the original file
119
- binary_hash: Uint64 # the binary hash of the original file.
120
- # TODO: Change to be Uint64 and provide utility method to generate
121
-
122
- filename: str # The name of the original file, including extension, without path.
123
- # Could stem from filesystem, source URI, Content-Disposition header, ...
124
-
125
- uri: Optional[AnyUrl] = (
126
- None # any possible reference to a source file,
127
- # from any file handler protocol (e.g. https://, file://, s3://)
128
- )
129
-
130
- @field_validator("binary_hash", mode="before")
131
- @classmethod
132
- def parse_hex_string(cls, value):
133
- """parse_hex_string."""
134
- if isinstance(value, str):
135
- try:
136
- # Convert hex string to an integer
137
- hash_int = Uint64(value, 16)
138
- # Mask to fit within 64 bits (unsigned)
139
- return (
140
- hash_int & 0xFFFFFFFFFFFFFFFF
141
- ) # TODO be sure it doesn't clip uint64 max
142
- except ValueError:
143
- raise ValueError(f"Invalid sha256 hexdigest: {value}")
144
- return value # If already an int, return it as is.
145
-
146
- @field_validator("mimetype")
147
- @classmethod
148
- def validate_mimetype(cls, v):
149
- """validate_mimetype."""
150
- # Check if the provided MIME type is valid using mimetypes module
151
- if v not in mimetypes.types_map.values():
152
- raise ValueError(f"'{v}' is not a valid MIME type")
153
- return v
154
-
155
-
156
- class RefItem(BaseModel):
157
- """RefItem."""
158
-
159
- cref: str = Field(alias="$ref", pattern=_JSON_POINTER_REGEX)
160
-
161
- # This method makes RefItem compatible with DocItem
162
- def get_ref(self):
163
- """get_ref."""
164
- return self
165
-
166
- model_config = ConfigDict(
167
- populate_by_name=True,
168
- )
169
-
170
- def resolve(self, doc: "DoclingDocument"):
171
- """resolve."""
172
- path_components = self.cref.split("/")
173
- if (num_comps := len(path_components)) == 3:
174
- _, path, index_str = path_components
175
- index = int(index_str)
176
- obj = doc.__getattribute__(path)[index]
177
- elif num_comps == 2:
178
- _, path = path_components
179
- obj = doc.__getattribute__(path)
180
- else:
181
- raise RuntimeError(f"Unsupported number of path components: {num_comps}")
182
- return obj
183
-
184
-
185
- class ImageRef(BaseModel):
186
- """ImageRef."""
187
-
188
- mimetype: str
189
- dpi: int
190
- size: Size
191
- uri: AnyUrl
192
-
193
- @field_validator("mimetype")
194
- @classmethod
195
- def validate_mimetype(cls, v):
196
- """validate_mimetype."""
197
- # Check if the provided MIME type is valid using mimetypes module
198
- if v not in mimetypes.types_map.values():
199
- raise ValueError(f"'{v}' is not a valid MIME type")
200
- return v
201
-
202
-
203
- class ProvenanceItem(BaseModel):
204
- """ProvenanceItem."""
205
-
206
- page_no: int
207
- bbox: BoundingBox
208
- charspan: Tuple[int, int]
209
-
210
-
211
- class NodeItem(BaseModel):
212
- """NodeItem."""
213
-
214
- self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
215
- parent: Optional[RefItem] = None
216
- children: List[RefItem] = []
217
-
218
- model_config = ConfigDict(extra="forbid")
219
-
220
- def get_ref(self):
221
- """get_ref."""
222
- return RefItem(cref=self.self_ref)
223
-
224
-
225
- class GroupItem(NodeItem): # Container type, can't be a leaf node
226
- """GroupItem."""
227
-
228
- name: str = (
229
- "group" # Name of the group, e.g. "Introduction Chapter",
230
- # "Slide 5", "Navigation menu list", ...
231
- )
232
- label: GroupLabel = GroupLabel.UNSPECIFIED
233
-
234
-
235
- class DocItem(
236
- NodeItem
237
- ): # Base type for any element that carries content, can be a leaf node
238
- """DocItem."""
239
-
240
- label: DocItemLabel
241
- prov: List[ProvenanceItem] = []
242
-
243
- def get_location_tokens(
244
- self,
245
- doc: "DoclingDocument",
246
- new_line: str,
247
- xsize: int = 100,
248
- ysize: int = 100,
249
- add_page_index: bool = True,
250
- ) -> str:
251
- """Get the location string for the BaseCell."""
252
- if not len(self.prov):
253
- return ""
254
-
255
- location = ""
256
- for prov in self.prov:
257
- page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
258
-
259
- page_i = -1
260
- if add_page_index:
261
- page_i = prov.page_no
262
-
263
- loc_str = DocumentToken.get_location(
264
- bbox=prov.bbox.to_bottom_left_origin(page_h).as_tuple(),
265
- page_w=page_w,
266
- page_h=page_h,
267
- xsize=xsize,
268
- ysize=ysize,
269
- page_i=page_i,
270
- )
271
- location += f"{loc_str}{new_line}"
272
-
273
- return location
274
-
275
-
276
- class TextItem(DocItem):
277
- """TextItem."""
278
-
279
- orig: str # untreated representation
280
- text: str # sanitized representation
281
-
282
- def export_to_document_tokens(
283
- self,
284
- doc: "DoclingDocument",
285
- new_line: str = "\n",
286
- xsize: int = 100,
287
- ysize: int = 100,
288
- add_location: bool = True,
289
- add_content: bool = True,
290
- add_page_index: bool = True,
291
- ):
292
- r"""Export text element to document tokens format.
293
-
294
- :param doc: "DoclingDocument":
295
- :param new_line: str: (Default value = "\n")
296
- :param xsize: int: (Default value = 100)
297
- :param ysize: int: (Default value = 100)
298
- :param add_location: bool: (Default value = True)
299
- :param add_content: bool: (Default value = True)
300
- :param add_page_index: bool: (Default value = True)
301
-
302
- """
303
- body = f"<{self.label.value}>"
304
-
305
- # TODO: This must be done through an explicit mapping.
306
- # assert DocumentToken.is_known_token(
307
- # body
308
- # ), f"failed DocumentToken.is_known_token({body})"
309
-
310
- if add_location:
311
- body += self.get_location_tokens(
312
- doc=doc,
313
- new_line="",
314
- xsize=xsize,
315
- ysize=ysize,
316
- add_page_index=add_page_index,
317
- )
318
-
319
- if add_content and self.text is not None:
320
- body += self.text.strip()
321
-
322
- body += f"</{self.label.value}>{new_line}"
323
-
324
- return body
325
-
326
-
327
- class SectionHeaderItem(TextItem):
328
- """SectionItem."""
329
-
330
- label: typing.Literal[DocItemLabel.SECTION_HEADER] = DocItemLabel.SECTION_HEADER
331
- level: LevelNumber
332
-
333
-
334
- class FloatingItem(DocItem):
335
- """FloatingItem."""
336
-
337
- captions: List[RefItem] = []
338
- references: List[RefItem] = []
339
- footnotes: List[RefItem] = []
340
- image: Optional[ImageRef] = None
341
-
342
-
343
- class PictureItem(FloatingItem):
344
- """PictureItem."""
345
-
346
- label: typing.Literal[DocItemLabel.PICTURE] = DocItemLabel.PICTURE
347
-
348
- data: BasePictureData
349
-
350
- def export_to_document_tokens(
351
- self,
352
- doc: "DoclingDocument",
353
- new_line: str = "\n",
354
- xsize: int = 100,
355
- ysize: int = 100,
356
- add_location: bool = True,
357
- add_caption: bool = True,
358
- add_content: bool = True, # not used at the moment
359
- add_page_index: bool = True,
360
- ):
361
- r"""Export picture to document tokens format.
362
-
363
- :param doc: "DoclingDocument":
364
- :param new_line: str: (Default value = "\n")
365
- :param xsize: int: (Default value = 100)
366
- :param ysize: int: (Default value = 100)
367
- :param add_location: bool: (Default value = True)
368
- :param add_caption: bool: (Default value = True)
369
- :param add_content: bool: (Default value = True)
370
- :param # not used at the momentadd_page_index: bool: (Default value = True)
371
-
372
- """
373
- body = f"{DocumentToken.BEG_FIGURE.value}{new_line}"
374
-
375
- if add_location:
376
- body += self.get_location_tokens(
377
- doc=doc,
378
- new_line=new_line,
379
- xsize=xsize,
380
- ysize=ysize,
381
- add_page_index=add_page_index,
382
- )
383
-
384
- if add_caption and len(self.captions):
385
- text = ""
386
- for cap in self.captions:
387
- text += cap.resolve(doc).text
388
-
389
- if len(text):
390
- body += f"{DocumentToken.BEG_CAPTION.value}"
391
- body += f"{text.strip()}"
392
- body += f"{DocumentToken.END_CAPTION.value}"
393
- body += f"{new_line}"
394
-
395
- body += f"{DocumentToken.END_FIGURE.value}{new_line}"
396
-
397
- return body
398
-
399
-
400
- class TableItem(FloatingItem):
401
- """TableItem."""
402
-
403
- data: BaseTableData
404
- label: typing.Literal[DocItemLabel.TABLE] = DocItemLabel.TABLE
405
-
406
- def export_to_dataframe(self) -> pd.DataFrame:
407
- """Export the table as a Pandas DataFrame."""
408
- if self.data is None or self.data.num_rows == 0 or self.data.num_cols == 0:
409
- return pd.DataFrame()
410
-
411
- # Count how many rows are column headers
412
- num_headers = 0
413
- for i, row in enumerate(self.data.grid):
414
- if len(row) == 0:
415
- raise RuntimeError(
416
- f"Invalid table. {len(row)=} but {self.data.num_cols=}."
417
- )
418
-
419
- any_header = False
420
- for cell in row:
421
- if cell.column_header:
422
- any_header = True
423
- break
424
-
425
- if any_header:
426
- num_headers += 1
427
- else:
428
- break
429
-
430
- # Create the column names from all col_headers
431
- columns: Optional[List[str]] = None
432
- if num_headers > 0:
433
- columns = ["" for _ in range(self.data.num_cols)]
434
- for i in range(num_headers):
435
- for j, cell in enumerate(self.data.grid[i]):
436
- col_name = cell.text
437
- if columns[j] != "":
438
- col_name = f".{col_name}"
439
- columns[j] += col_name
440
-
441
- # Create table data
442
- table_data = [
443
- [cell.text for cell in row] for row in self.data.grid[num_headers:]
444
- ]
445
-
446
- # Create DataFrame
447
- df = pd.DataFrame(table_data, columns=columns)
448
-
449
- return df
450
-
451
- def export_to_html(self) -> str:
452
- """Export the table as html."""
453
- body = ""
454
- nrows = self.data.num_rows
455
- ncols = self.data.num_cols
456
-
457
- if not len(self.data.table_cells):
458
- return ""
459
- for i in range(nrows):
460
- body += "<tr>"
461
- for j in range(ncols):
462
- cell: TableCell = self.data.grid[i][j]
463
-
464
- rowspan, rowstart = (
465
- cell.row_span,
466
- cell.start_row_offset_idx,
467
- )
468
- colspan, colstart = (
469
- cell.col_span,
470
- cell.start_col_offset_idx,
471
- )
472
-
473
- if rowstart != i:
474
- continue
475
- if colstart != j:
476
- continue
477
-
478
- content = cell.text.strip()
479
- celltag = "td"
480
- if cell.column_header:
481
- celltag = "th"
482
-
483
- opening_tag = f"{celltag}"
484
- if rowspan > 1:
485
- opening_tag += f' rowspan="{rowspan}"'
486
- if colspan > 1:
487
- opening_tag += f' colspan="{colspan}"'
488
-
489
- body += f"<{opening_tag}>{content}</{celltag}>"
490
- body += "</tr>"
491
- body = f"<table>{body}</table>"
492
-
493
- return body
494
-
495
- def export_to_document_tokens(
496
- self,
497
- doc: "DoclingDocument",
498
- new_line: str = "\n",
499
- xsize: int = 100,
500
- ysize: int = 100,
501
- add_location: bool = True,
502
- add_caption: bool = True,
503
- add_content: bool = True,
504
- add_cell_location: bool = True,
505
- add_cell_label: bool = True,
506
- add_cell_text: bool = True,
507
- add_page_index: bool = True,
508
- ):
509
- r"""Export table to document tokens format.
510
-
511
- :param doc: "DoclingDocument":
512
- :param new_line: str: (Default value = "\n")
513
- :param xsize: int: (Default value = 100)
514
- :param ysize: int: (Default value = 100)
515
- :param add_location: bool: (Default value = True)
516
- :param add_caption: bool: (Default value = True)
517
- :param add_content: bool: (Default value = True)
518
- :param add_cell_location: bool: (Default value = True)
519
- :param add_cell_label: bool: (Default value = True)
520
- :param add_cell_text: bool: (Default value = True)
521
- :param add_page_index: bool: (Default value = True)
522
-
523
- """
524
- body = f"{DocumentToken.BEG_TABLE.value}{new_line}"
525
-
526
- if add_location:
527
- body += self.get_location_tokens(
528
- doc=doc,
529
- new_line=new_line,
530
- xsize=xsize,
531
- ysize=ysize,
532
- add_page_index=add_page_index,
533
- )
534
-
535
- if add_caption and len(self.captions):
536
- text = ""
537
- for cap in self.captions:
538
- text += cap.resolve(doc).text
539
-
540
- if len(text):
541
- body += f"{DocumentToken.BEG_CAPTION.value}"
542
- body += f"{text.strip()}"
543
- body += f"{DocumentToken.END_CAPTION.value}"
544
- body += f"{new_line}"
545
-
546
- if add_content and len(self.data.table_cells) > 0:
547
- for i, row in enumerate(self.data.grid):
548
- body += f"<row_{i}>"
549
- for j, col in enumerate(row):
550
-
551
- text = ""
552
- if add_cell_text:
553
- text = col.text.strip()
554
-
555
- cell_loc = ""
556
- if (
557
- col.bbox is not None
558
- and add_cell_location
559
- and add_page_index
560
- and len(self.prov) > 0
561
- ):
562
- page_w, page_h = doc.pages[self.prov[0].page_no].size.as_tuple()
563
- cell_loc = DocumentToken.get_location(
564
- bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
565
- page_w=page_w,
566
- page_h=page_h,
567
- xsize=xsize,
568
- ysize=ysize,
569
- page_i=self.prov[0].page_no,
570
- )
571
- elif (
572
- col.bbox is not None
573
- and add_cell_location
574
- and not add_page_index
575
- and len(self.prov) > 0
576
- ):
577
- page_w, page_h = doc.pages[self.prov[0].page_no].size.as_tuple()
578
-
579
- cell_loc = DocumentToken.get_location(
580
- bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
581
- page_w=page_w,
582
- page_h=page_h,
583
- xsize=xsize,
584
- ysize=ysize,
585
- page_i=-1,
586
- )
587
-
588
- cell_label = ""
589
- if add_cell_label:
590
- if col.column_header:
591
- cell_label = "<col_header>"
592
- elif col.row_header:
593
- cell_label = "<row_header>"
594
- elif col.row_section:
595
- cell_label = "<row_section>"
596
- else:
597
- cell_label = "<body>"
598
-
599
- body += f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>"
600
-
601
- body += f"</row_{i}>{new_line}"
602
-
603
- body += f"{DocumentToken.END_TABLE.value}{new_line}"
604
-
605
- return body
606
-
607
-
608
- class KeyValueItem(DocItem):
609
- """KeyValueItem."""
610
-
611
-
612
- ContentItem = Union[TextItem, SectionHeaderItem, PictureItem, TableItem, KeyValueItem]
613
-
614
-
615
- class PageItem(BaseModel):
616
- """PageItem."""
617
-
618
- # A page carries separate root items for furniture and body,
619
- # only referencing items on the page
620
- size: Size
621
- image: Optional[ImageRef] = None
622
- page_no: int
623
-
624
-
625
- class DescriptionItem(BaseModel):
626
- """DescriptionItem."""
627
-
628
-
629
- class DoclingDocument(BaseModel):
630
- """DoclingDocument."""
631
-
632
- schema_name: typing.Literal["DoclingDocument"] = "DoclingDocument"
633
- version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
634
- CURRENT_VERSION
635
- )
636
- description: DescriptionItem
637
- name: str # The working name of this document, without extensions
638
- # (could be taken from originating doc, or just "Untitled 1")
639
- origin: Optional[DocumentOrigin] = (
640
- None # DoclingDocuments may specify an origin (converted to DoclingDocument).
641
- # This is optional, e.g. a DoclingDocument could also be entirely
642
- # generated from synthetic data.
643
- )
644
-
645
- furniture: GroupItem = GroupItem(
646
- name="_root_", self_ref="#/furniture"
647
- ) # List[RefItem] = []
648
- body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []
649
-
650
- groups: List[GroupItem] = []
651
- texts: List[Union[SectionHeaderItem, TextItem]] = []
652
- pictures: List[PictureItem] = []
653
- tables: List[TableItem] = []
654
- key_value_items: List[KeyValueItem] = []
655
-
656
- pages: Dict[int, PageItem] = {} # empty as default
657
-
658
- def add_group(
659
- self,
660
- label: Optional[GroupLabel] = None,
661
- name: Optional[str] = None,
662
- parent: Optional[GroupItem] = None,
663
- ) -> GroupItem:
664
- """add_group.
665
-
666
- :param label: Optional[GroupLabel]: (Default value = None)
667
- :param name: Optional[str]: (Default value = None)
668
- :param parent: Optional[GroupItem]: (Default value = None)
669
-
670
- """
671
- if not parent:
672
- parent = self.body
673
-
674
- group_index = len(self.groups)
675
- cref = f"#/groups/{group_index}"
676
-
677
- group = GroupItem(self_ref=cref, parent=parent.get_ref())
678
- if name is not None:
679
- group.name = name
680
- if label is not None:
681
- group.label = label
682
-
683
- self.groups.append(group)
684
- parent.children.append(RefItem(cref=cref))
685
-
686
- return group
687
-
688
- def add_text(
689
- self,
690
- label: str,
691
- text: str,
692
- orig: Optional[str] = None,
693
- prov: Optional[ProvenanceItem] = None,
694
- parent: Optional[GroupItem] = None,
695
- ):
696
- """add_paragraph.
697
-
698
- :param label: str:
699
- :param text: str:
700
- :param orig: Optional[str]: (Default value = None)
701
- :param prov: Optional[ProvenanceItem]: (Default value = None)
702
- :param parent: Optional[GroupItem]: (Default value = None)
703
-
704
- """
705
- if not parent:
706
- parent = self.body
707
-
708
- if not orig:
709
- orig = text
710
-
711
- text_index = len(self.texts)
712
- cref = f"#/texts/{text_index}"
713
- text_item = TextItem(
714
- label=label,
715
- text=text,
716
- orig=orig,
717
- self_ref=cref,
718
- parent=parent.get_ref(),
719
- )
720
- if prov:
721
- text_item.prov.append(prov)
722
-
723
- self.texts.append(text_item)
724
- parent.children.append(RefItem(cref=cref))
725
-
726
- return text_item
727
-
728
- def add_table(
729
- self,
730
- data: BaseTableData,
731
- caption: Optional[Union[TextItem, RefItem]] = None, # This is not cool yet.
732
- prov: Optional[ProvenanceItem] = None,
733
- parent: Optional[GroupItem] = None,
734
- ):
735
- """add_table.
736
-
737
- :param data: BaseTableData:
738
- :param caption: Optional[Union[TextItem:
739
- :param RefItem]]: (Default value = None)
740
- :param # This is not cool yet.prov: Optional[ProvenanceItem]
741
- :param parent: Optional[GroupItem]: (Default value = None)
742
-
743
- """
744
- if not parent:
745
- parent = self.body
746
-
747
- table_index = len(self.tables)
748
- cref = f"#/tables/{table_index}"
749
-
750
- tbl_item = TableItem(
751
- label=DocItemLabel.TABLE, data=data, self_ref=cref, parent=parent.get_ref()
752
- )
753
- if prov:
754
- tbl_item.prov.append(prov)
755
- if caption:
756
- tbl_item.captions.append(caption.get_ref())
757
-
758
- self.tables.append(tbl_item)
759
- parent.children.append(RefItem(cref=cref))
760
-
761
- return tbl_item
762
-
763
- def add_picture(
764
- self,
765
- data: BasePictureData,
766
- caption: Optional[Union[TextItem, RefItem]] = None,
767
- prov: Optional[ProvenanceItem] = None,
768
- parent: Optional[GroupItem] = None,
769
- ):
770
- """add_picture.
771
-
772
- :param data: BasePictureData:
773
- :param caption: Optional[Union[TextItem:
774
- :param RefItem]]: (Default value = None)
775
- :param prov: Optional[ProvenanceItem]: (Default value = None)
776
- :param parent: Optional[GroupItem]: (Default value = None)
777
-
778
- """
779
- if not parent:
780
- parent = self.body
781
-
782
- picture_index = len(self.pictures)
783
- cref = f"#/pictures/{picture_index}"
784
-
785
- fig_item = PictureItem(
786
- label=DocItemLabel.PICTURE,
787
- data=data,
788
- self_ref=cref,
789
- parent=parent.get_ref(),
790
- )
791
- if prov:
792
- fig_item.prov.append(prov)
793
- if caption:
794
- fig_item.captions.append(caption.get_ref())
795
-
796
- self.pictures.append(fig_item)
797
- parent.children.append(RefItem(cref=cref))
798
-
799
- return fig_item
800
-
801
- def add_heading(
802
- self,
803
- text: str,
804
- orig: Optional[str] = None,
805
- level: LevelNumber = 1,
806
- prov: Optional[ProvenanceItem] = None,
807
- parent: Optional[GroupItem] = None,
808
- ):
809
- """add_heading.
810
-
811
- :param label: DocItemLabel:
812
- :param text: str:
813
- :param orig: Optional[str]: (Default value = None)
814
- :param level: LevelNumber: (Default value = 1)
815
- :param prov: Optional[ProvenanceItem]: (Default value = None)
816
- :param parent: Optional[GroupItem]: (Default value = None)
817
-
818
- """
819
- if not parent:
820
- parent = self.body
821
-
822
- if not orig:
823
- orig = text
824
-
825
- text_index = len(self.texts)
826
- cref = f"#/texts/{text_index}"
827
- section_header_item = SectionHeaderItem(
828
- level=level,
829
- text=text,
830
- orig=orig,
831
- self_ref=cref,
832
- parent=parent.get_ref(),
833
- )
834
- if prov:
835
- section_header_item.prov.append(prov)
836
-
837
- self.texts.append(section_header_item)
838
- parent.children.append(RefItem(cref=cref))
839
-
840
- return section_header_item
841
-
842
- def num_pages(self):
843
- """num_pages."""
844
- return len(self.pages.values())
845
-
846
- def validate_tree(self, root) -> bool:
847
- """validate_tree."""
848
- res = []
849
- for child_ref in root.children:
850
- child = child_ref.resolve(self)
851
- if child.parent.resolve(self) != root:
852
- return False
853
- res.append(self.validate_tree(child))
854
-
855
- return all(res) or len(res) == 0
856
-
857
- def iterate_items(
858
- self,
859
- root: Optional[NodeItem] = None,
860
- with_groups: bool = False,
861
- traverse_pictures: bool = True,
862
- page_no: Optional[int] = None,
863
- _level: int = 0, # fixed parameter, carries through the node nesting level
864
- ) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
865
- """iterate_elements.
866
-
867
- :param root: Optional[NodeItem]: (Default value = None)
868
- :param with_groups: bool: (Default value = False)
869
- :param traverse_pictures: bool: (Default value = True)
870
- :param page_no: Optional[int]: (Default value = None)
871
- :param _level: (Default value = 0)
872
- :param # fixed parameter:
873
- :param carries through the node nesting level:
874
- """
875
- if not root:
876
- root = self.body
877
-
878
- if not isinstance(root, GroupItem) or with_groups:
879
- if isinstance(root, DocItem):
880
- if page_no is not None:
881
- for prov in root.prov:
882
- if prov.page_no == page_no:
883
- yield root, _level
884
- else:
885
- yield root, _level
886
- else:
887
- yield root, _level
888
-
889
- # Traverse children
890
- for child_ref in root.children:
891
- child = child_ref.resolve(self)
892
-
893
- if isinstance(child, NodeItem):
894
- # If the child is a NodeItem, recursively traverse it
895
- if not isinstance(child, PictureItem) or traverse_pictures:
896
- yield from self.iterate_items(
897
- child, _level=_level + 1, with_groups=with_groups
898
- )
899
-
900
- def print_element_tree(self):
901
- """print_element_tree."""
902
- for ix, (item, level) in enumerate(self.iterate_items(with_groups=True)):
903
- if isinstance(item, GroupItem):
904
- print(" " * level, f"{ix}: {item.label.value} with name={item.name}")
905
- elif isinstance(item, DocItem):
906
- print(" " * level, f"{ix}: {item.label.value}")
907
-
908
- def export_to_markdown(
909
- self,
910
- delim: str = "\n\n",
911
- from_element: int = 0,
912
- to_element: Optional[int] = None,
913
- labels: list[DocItemLabel] = [
914
- DocItemLabel.TITLE,
915
- DocItemLabel.SECTION_HEADER,
916
- DocItemLabel.PARAGRAPH,
917
- DocItemLabel.CAPTION,
918
- DocItemLabel.TABLE,
919
- DocItemLabel.TEXT,
920
- ],
921
- strict_text: bool = False,
922
- ) -> str:
923
- r"""Serialize to Markdown.
924
-
925
- Operates on a slice of the document's main_text as defined through arguments
926
- main_text_start and main_text_stop; defaulting to the whole main_text.
927
-
928
- :param delim: Delimiter to use when concatenating the various
929
- Markdown parts. Defaults to "\n\n".
930
- :type delim: str
931
- :param from_element: Body slicing start index (inclusive).
932
- Defaults to 0.
933
- :type from_element: int
934
- :param to_element: Body slicing stop index
935
- (exclusive). Defaults to None.
936
- :type to_element: Optional[int]
937
- :param delim: str: (Default value = "\n\n")
938
- :param from_element: int: (Default value = 0)
939
- :param to_element: Optional[int]: (Default value = None)
940
- :param labels: list[DocItemLabel]
941
- :param "subtitle-level-1":
942
- :param "paragraph":
943
- :param "caption":
944
- :param "table":
945
- :param "Text":
946
- :param "text":
947
- :param ]:
948
- :param strict_text: bool: (Default value = False)
949
- :returns: The exported Markdown representation.
950
- :rtype: str
951
- """
952
- has_title = False
953
- prev_text = ""
954
- md_texts: list[str] = []
955
-
956
- skip_count = 0
957
- for ix, (item, level) in enumerate(self.iterate_items(self.body)):
958
- if skip_count < from_element:
959
- skip_count += 1
960
- continue # skip as many items as you want
961
-
962
- if to_element and ix >= to_element:
963
- break
964
-
965
- markdown_text = ""
966
-
967
- if isinstance(item, DocItem):
968
- item_type = item.label
969
-
970
- if isinstance(item, TextItem) and item_type in labels:
971
- text = item.text
972
-
973
- # ignore repeated text
974
- if prev_text == text or text is None:
975
- continue
976
- else:
977
- prev_text = text
978
-
979
- # first title match
980
- if item_type == "title" and not has_title:
981
- if strict_text:
982
- markdown_text = f"{text}"
983
- else:
984
- markdown_text = f"# {text}"
985
- has_title = True
986
-
987
- # secondary titles
988
- elif item_type in {"title", "subtitle-level-1"} or (
989
- has_title and item_type == "title"
990
- ):
991
- if strict_text:
992
- markdown_text = f"{text}"
993
- else:
994
- markdown_text = f"## {text}"
995
-
996
- # normal text
997
- else:
998
- markdown_text = text
999
-
1000
- elif (
1001
- isinstance(item, TableItem)
1002
- and item.data
1003
- and item_type in labels
1004
- and not strict_text
1005
- ):
1006
- table = []
1007
- for row in item.data.grid:
1008
- tmp = []
1009
- for col in row:
1010
- tmp.append(col.text)
1011
- table.append(tmp)
1012
-
1013
- if len(table) > 1 and len(table[0]) > 0:
1014
- try:
1015
- md_table = tabulate(
1016
- table[1:], headers=table[0], tablefmt="github"
1017
- )
1018
- except ValueError:
1019
- md_table = tabulate(
1020
- table[1:],
1021
- headers=table[0],
1022
- tablefmt="github",
1023
- disable_numparse=True,
1024
- )
1025
-
1026
- markdown_text = md_table
1027
-
1028
- if markdown_text:
1029
- md_texts.append(markdown_text)
1030
-
1031
- result = delim.join(md_texts)
1032
- return result
1033
-
1034
- def export_to_document_tokens(
1035
- self,
1036
- delim: str = "\n\n",
1037
- from_element: int = 0,
1038
- to_element: Optional[int] = None,
1039
- labels: list[DocItemLabel] = [
1040
- DocItemLabel.TITLE,
1041
- DocItemLabel.SECTION_HEADER,
1042
- DocItemLabel.PARAGRAPH,
1043
- DocItemLabel.CAPTION,
1044
- DocItemLabel.TABLE,
1045
- DocItemLabel.TEXT,
1046
- ],
1047
- xsize: int = 100,
1048
- ysize: int = 100,
1049
- add_location: bool = True,
1050
- add_content: bool = True,
1051
- add_page_index: bool = True,
1052
- # table specific flags
1053
- add_table_cell_location: bool = False,
1054
- add_table_cell_label: bool = True,
1055
- add_table_cell_text: bool = True,
1056
- ) -> str:
1057
- r"""Exports the document content to an DocumentToken format.
1058
-
1059
- Operates on a slice of the document's body as defined through arguments
1060
- from_element and to_element; defaulting to the whole main_text.
1061
-
1062
- :param delim: str: (Default value = "\n\n")
1063
- :param from_element: int: (Default value = 0)
1064
- :param to_element: Optional[int]: (Default value = None)
1065
- :param labels: list[DocItemLabel]
1066
- :param xsize: int: (Default value = 100)
1067
- :param ysize: int: (Default value = 100)
1068
- :param add_location: bool: (Default value = True)
1069
- :param add_content: bool: (Default value = True)
1070
- :param add_page_index: bool: (Default value = True)
1071
- :param # table specific flagsadd_table_cell_location: bool
1072
- :param add_table_cell_label: bool: (Default value = True)
1073
- :param add_table_cell_text: bool: (Default value = True)
1074
- :returns: The content of the document formatted as a DocTags string.
1075
- :rtype: str
1076
- """
1077
- new_line = ""
1078
- if delim:
1079
- new_line = "\n"
1080
-
1081
- doctags = f"{DocumentToken.BEG_DOCUMENT.value}{new_line}"
1082
-
1083
- # pagedims = self.get_map_to_page_dimensions()
1084
-
1085
- skip_count = 0
1086
- for ix, (item, level) in enumerate(self.iterate_items(self.body)):
1087
- if skip_count < from_element:
1088
- skip_count += 1
1089
- continue # skip as many items as you want
1090
-
1091
- if to_element and ix >= to_element:
1092
- break
1093
-
1094
- if not isinstance(item, DocItem):
1095
- continue
1096
-
1097
- prov = item.prov
1098
-
1099
- page_i = -1
1100
-
1101
- if add_location and len(self.pages) and len(prov) > 0:
1102
-
1103
- page_i = prov[0].page_no
1104
- page_dim = self.pages[page_i].size
1105
-
1106
- float(page_dim.width)
1107
- float(page_dim.height)
1108
-
1109
- item_type = item.label
1110
- if isinstance(item, TextItem) and (item_type in labels):
1111
-
1112
- doctags += item.export_to_document_tokens(
1113
- doc=self,
1114
- new_line=new_line,
1115
- xsize=xsize,
1116
- ysize=ysize,
1117
- add_location=add_location,
1118
- add_content=add_content,
1119
- add_page_index=add_page_index,
1120
- )
1121
-
1122
- elif isinstance(item, TableItem) and (item_type in labels):
1123
-
1124
- doctags += item.export_to_document_tokens(
1125
- doc=self,
1126
- new_line=new_line,
1127
- xsize=xsize,
1128
- ysize=ysize,
1129
- add_caption=True,
1130
- add_location=add_location,
1131
- add_content=add_content,
1132
- add_cell_location=add_table_cell_location,
1133
- add_cell_label=add_table_cell_label,
1134
- add_cell_text=add_table_cell_text,
1135
- add_page_index=add_page_index,
1136
- )
1137
-
1138
- elif isinstance(item, PictureItem) and (item_type in labels):
1139
-
1140
- doctags += item.export_to_document_tokens(
1141
- doc=self,
1142
- new_line=new_line,
1143
- xsize=xsize,
1144
- ysize=ysize,
1145
- add_caption=True,
1146
- add_location=add_location,
1147
- add_content=add_content,
1148
- add_page_index=add_page_index,
1149
- )
1150
-
1151
- doctags += DocumentToken.END_DOCUMENT.value
1152
-
1153
- return doctags
1154
-
1155
- def add_page(self, page_no: int, size: Size) -> PageItem:
1156
- """add_page.
1157
-
1158
- :param page_no: int:
1159
- :param size: Size:
1160
-
1161
- """
1162
- pitem = PageItem(page_no=page_no, size=size)
1163
-
1164
- self.pages[page_no] = pitem
1165
- return pitem
1166
-
1167
- @field_validator("version")
1168
- @classmethod
1169
- def check_version_is_compatible(cls, v: str) -> str:
1170
- """Check if this document version is compatible with current version."""
1171
- current_match = re.match(VERSION_PATTERN, CURRENT_VERSION)
1172
- doc_match = re.match(VERSION_PATTERN, v)
1173
- if (
1174
- doc_match is None
1175
- or current_match is None
1176
- or doc_match["major"] != current_match["major"]
1177
- or doc_match["minor"] > current_match["minor"]
1178
- ):
1179
- raise ValueError(
1180
- f"incompatible version {v} with schema version {CURRENT_VERSION}"
1181
- )
1182
- else:
1183
- return CURRENT_VERSION
1184
-
1185
- @model_validator(mode="after") # type: ignore
1186
- @classmethod
1187
- def validate_document(cls, d: "DoclingDocument"):
1188
- """validate_document."""
1189
- if not d.validate_tree(d.body) or not d.validate_tree(d.furniture):
1190
- raise ValueError("Document hierachy is inconsistent.")
1191
-
1192
- return d