docling-core 1.7.2__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (36) hide show
  1. docling_core/transforms/chunker/__init__.py +2 -8
  2. docling_core/transforms/chunker/base.py +27 -40
  3. docling_core/transforms/chunker/hierarchical_chunker.py +144 -312
  4. docling_core/types/__init__.py +3 -18
  5. docling_core/types/doc/__init__.py +25 -0
  6. docling_core/types/doc/base.py +136 -451
  7. docling_core/types/doc/document.py +1289 -559
  8. docling_core/types/{experimental → doc}/labels.py +4 -1
  9. docling_core/types/legacy_doc/__init__.py +6 -0
  10. docling_core/types/legacy_doc/base.py +485 -0
  11. docling_core/types/{doc → legacy_doc}/doc_ann.py +1 -1
  12. docling_core/types/{doc → legacy_doc}/doc_ocr.py +1 -1
  13. docling_core/types/{doc → legacy_doc}/doc_raw.py +1 -1
  14. docling_core/types/legacy_doc/document.py +715 -0
  15. docling_core/types/rec/subject.py +1 -1
  16. docling_core/utils/generate_docs.py +82 -0
  17. docling_core/utils/{ds_generate_jsonschema.py → generate_jsonschema.py} +4 -4
  18. docling_core/utils/validators.py +3 -3
  19. {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/METADATA +17 -17
  20. {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/RECORD +24 -31
  21. docling_core-2.0.1.dist-info/entry_points.txt +5 -0
  22. docling_core/transforms/id_generator/__init__.py +0 -12
  23. docling_core/transforms/id_generator/base.py +0 -30
  24. docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -27
  25. docling_core/transforms/id_generator/uuid_generator.py +0 -34
  26. docling_core/transforms/metadata_extractor/__init__.py +0 -13
  27. docling_core/transforms/metadata_extractor/base.py +0 -59
  28. docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +0 -59
  29. docling_core/types/experimental/__init__.py +0 -30
  30. docling_core/types/experimental/base.py +0 -167
  31. docling_core/types/experimental/document.py +0 -1192
  32. docling_core/utils/ds_generate_docs.py +0 -144
  33. docling_core-1.7.2.dist-info/entry_points.txt +0 -5
  34. /docling_core/types/{doc → legacy_doc}/tokens.py +0 -0
  35. {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/LICENSE +0 -0
  36. {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/WHEEL +0 -0
@@ -1,452 +1,1117 @@
1
- #
2
- # Copyright IBM Corp. 2024 - 2024
3
- # SPDX-License-Identifier: MIT
4
- #
5
-
6
1
  """Models for the Docling Document data type."""
7
2
 
8
- from datetime import datetime
9
- from typing import Generic, Optional, Union
3
+ import base64
4
+ import mimetypes
5
+ import re
6
+ import typing
7
+ from io import BytesIO
8
+ from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
10
9
 
10
+ import pandas as pd
11
+ from PIL import Image as PILImage
11
12
  from pydantic import (
12
- AnyHttpUrl,
13
+ AnyUrl,
13
14
  BaseModel,
15
+ ConfigDict,
14
16
  Field,
15
- NonNegativeInt,
16
- StrictStr,
17
+ StringConstraints,
18
+ computed_field,
19
+ field_validator,
17
20
  model_validator,
18
21
  )
19
22
  from tabulate import tabulate
23
+ from typing_extensions import Annotated, Self
20
24
 
21
- from docling_core.search.mapping import es_field
22
- from docling_core.types.base import (
23
- Acquisition,
24
- CollectionDocumentInfo,
25
- CollectionNameTypeT,
26
- DescriptionAdvancedT,
27
- DescriptionAnalyticsT,
28
- FileInfoObject,
29
- Identifier,
30
- IdentifierTypeT,
31
- LanguageT,
32
- Log,
33
- )
34
- from docling_core.types.doc.base import (
35
- BaseCell,
36
- BaseText,
37
- BitmapObject,
38
- Figure,
39
- PageDimensions,
40
- PageReference,
41
- Ref,
42
- S3Data,
43
- Table,
44
- )
45
- from docling_core.types.doc.tokens import DocumentToken
46
- from docling_core.utils.alias import AliasModel
25
+ from docling_core.search.package import VERSION_PATTERN
26
+ from docling_core.types.base import _JSON_POINTER_REGEX
27
+ from docling_core.types.doc import BoundingBox, Size
28
+ from docling_core.types.doc.labels import DocItemLabel, GroupLabel
29
+ from docling_core.types.legacy_doc.tokens import DocumentToken
47
30
 
31
+ Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
32
+ LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
33
+ CURRENT_VERSION: Final = "1.0.0"
48
34
 
49
- class CCSFileInfoDescription(BaseModel, extra="forbid"):
50
- """File info description."""
35
+ DEFAULT_EXPORT_LABELS = {
36
+ DocItemLabel.TITLE,
37
+ DocItemLabel.DOCUMENT_INDEX,
38
+ DocItemLabel.SECTION_HEADER,
39
+ DocItemLabel.PARAGRAPH,
40
+ DocItemLabel.CAPTION,
41
+ DocItemLabel.TABLE,
42
+ DocItemLabel.PICTURE,
43
+ DocItemLabel.FORMULA,
44
+ DocItemLabel.CHECKBOX_UNSELECTED,
45
+ DocItemLabel.CHECKBOX_SELECTED,
46
+ DocItemLabel.TEXT,
47
+ DocItemLabel.LIST_ITEM,
48
+ DocItemLabel.CODE,
49
+ }
51
50
 
52
- author: Optional[list[StrictStr]] = None
53
- keywords: Optional[str] = None
54
- subject: Optional[str] = None
55
- title: Optional[StrictStr] = None
56
- creation_date: Optional[str] = None # datetime
57
51
 
52
+ class BasePictureData(BaseModel):
53
+ """BasePictureData."""
58
54
 
59
- class CCSFileInfoObject(FileInfoObject, extra="forbid"):
60
- """File info object."""
55
+ kind: str
61
56
 
62
- num_pages: Optional[int] = Field(default=None, alias="#-pages")
63
57
 
64
- collection_name: Optional[str] = Field(
65
- default=None,
66
- alias="collection-name",
67
- json_schema_extra=es_field(type="keyword", ignore_above=8191),
68
- )
69
- description: Optional[CCSFileInfoDescription] = Field(
70
- default=None, json_schema_extra=es_field(suppress=True)
71
- )
72
- page_hashes: Optional[list[PageReference]] = Field(
73
- default=None, alias="page-hashes"
74
- )
58
+ class PictureClassificationClass(BaseModel):
59
+ """PictureClassificationData."""
75
60
 
61
+ class_name: str
62
+ confidence: float
76
63
 
77
- class Affiliation(BaseModel, extra="forbid"):
78
- """Affiliation."""
79
-
80
- name: str = Field(
81
- ...,
82
- json_schema_extra=es_field(
83
- fields={
84
- "lower": {
85
- "normalizer": "lowercase_asciifolding",
86
- "type": "keyword",
87
- "ignore_above": 8191,
88
- },
89
- "keyword": {"type": "keyword", "ignore_above": 8191},
90
- },
91
- ),
92
- )
93
- id: Optional[str] = Field(
94
- default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
95
- )
96
- source: Optional[str] = Field(
97
- default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
98
- )
99
64
 
65
+ class PictureClassificationData(BasePictureData):
66
+ """PictureClassificationData."""
100
67
 
101
- class Author(BaseModel, extra="forbid"):
102
- """Author."""
103
-
104
- name: str = Field(
105
- ...,
106
- json_schema_extra=es_field(
107
- type="text",
108
- fields={
109
- "lower": {
110
- "normalizer": "lowercase_asciifolding",
111
- "type": "keyword",
112
- "ignore_above": 8191,
113
- },
114
- "keyword": {"type": "keyword", "ignore_above": 8191},
115
- },
116
- ),
117
- )
118
- id: Optional[str] = Field(
119
- default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
120
- )
121
- source: Optional[str] = Field(
122
- default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
123
- )
124
- affiliations: Optional[list[Affiliation]] = None
68
+ kind: Literal["classification"] = "classification"
69
+ provenance: str
70
+ predicted_classes: List[PictureClassificationClass]
125
71
 
126
72
 
127
- class Publication(BaseModel, Generic[IdentifierTypeT], extra="forbid"):
128
- """Publication details of a journal or venue."""
73
+ class PictureDescriptionData(BasePictureData):
74
+ """PictureDescriptionData."""
129
75
 
130
- identifiers: Optional[list[Identifier[IdentifierTypeT]]] = Field(
131
- default=None,
132
- description="Unique identifiers of a publication venue.",
133
- )
134
- name: StrictStr = Field(
135
- json_schema_extra=es_field(type="keyword", ignore_above=8191),
136
- description="Name of the publication.",
137
- )
138
- alternate_names: Optional[list[StrictStr]] = Field(
139
- default=None,
140
- json_schema_extra=es_field(type="text"),
141
- title="Alternate Names",
142
- description="Other names or abbreviations of this publication.",
143
- )
144
- type: Optional[list[StrictStr]] = Field(
145
- default=None,
146
- json_schema_extra=es_field(type="keyword", ignore_above=8191),
147
- description="Type of publication (journal article, conference, review,...).",
148
- )
149
- pages: Optional[StrictStr] = Field(
150
- default=None,
151
- json_schema_extra=es_field(type="text"),
152
- description="Page range in the publication.",
153
- )
154
- issue: Optional[StrictStr] = Field(
155
- default=None,
156
- json_schema_extra=es_field(type="keyword", ignore_above=8191),
157
- description="Publication issue (issue number).",
158
- )
159
- volume: Optional[StrictStr] = Field(
160
- default=None,
161
- json_schema_extra=es_field(type="keyword", ignore_above=8191),
162
- description="Publication volume.",
163
- )
164
- url: Optional[AnyHttpUrl] = Field(
165
- default=None,
166
- json_schema_extra=es_field(type="keyword", ignore_above=8191),
167
- description="URL on the publication site.",
168
- )
76
+ kind: Literal["description"] = "description"
77
+ text: str
78
+ provenance: str
169
79
 
170
80
 
171
- class DescriptionLicense(BaseModel, extra="forbid"):
172
- """Licence in document description."""
81
+ class PictureMoleculeData(BaseModel):
82
+ """PictureMoleculeData."""
173
83
 
174
- code: Optional[StrictStr] = Field(
175
- default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
176
- )
177
- text: Optional[StrictStr] = None
84
+ kind: Literal["molecule_data"] = "molecule_data"
178
85
 
86
+ smi: str
87
+ confidence: float
88
+ class_name: str
89
+ segmentation: List[Tuple[float, float]]
90
+ provenance: str
179
91
 
180
- class CCSDocumentDescription(
181
- AliasModel,
182
- Generic[
183
- DescriptionAdvancedT,
184
- DescriptionAnalyticsT,
185
- IdentifierTypeT,
186
- LanguageT,
187
- CollectionNameTypeT,
188
- ],
189
- ):
190
- """Description in document."""
191
-
192
- title: Optional[StrictStr] = None
193
- abstract: Optional[list[StrictStr]] = None
194
- authors: Optional[list[Author]] = None
195
- affiliations: Optional[list[Affiliation]] = None
196
- subjects: Optional[list[str]] = Field(
197
- default=None,
198
- json_schema_extra=es_field(
199
- fields={"keyword": {"ignore_above": 8191, "type": "keyword"}}
200
- ),
201
- )
202
- keywords: Optional[list[str]] = Field(
203
- default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
204
- )
205
- publication_date: Optional[datetime] = None
206
- languages: Optional[list[LanguageT]] = Field(
207
- default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
208
- )
209
- license_: Optional[DescriptionLicense] = Field(default=None, alias="license")
210
- publishers: Optional[list[StrictStr]] = Field(
211
- default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
212
- )
213
- url_refs: Optional[list[str]] = Field(
214
- default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
215
- )
216
- references: Optional[list[Identifier[IdentifierTypeT]]] = None
217
- publication: Optional[list[Publication]] = Field(
218
- default=None, description="List of publication journals or venues."
219
- )
220
- reference_count: Optional[NonNegativeInt] = Field(
221
- default=None,
222
- title="Reference Count",
223
- description="Total number of documents referenced by this document.",
224
- json_schema_extra=es_field(type="integer"),
225
- )
226
- citation_count: Optional[NonNegativeInt] = Field(
227
- default=None,
228
- title="Citation Count",
229
- description=(
230
- "Total number of citations that this document has received (number "
231
- "of documents in whose bibliography this document appears)."
232
- ),
233
- json_schema_extra=es_field(type="integer"),
234
- )
235
- citation_date: Optional[datetime] = Field(
236
- default=None,
237
- title="Citation Count Date",
238
- description="Last update date of the citation count.",
239
- )
240
- advanced: Optional[DescriptionAdvancedT] = None
241
- analytics: Optional[DescriptionAnalyticsT] = None
242
- logs: list[Log]
243
- collection: Optional[CollectionDocumentInfo[CollectionNameTypeT]] = Field(
244
- default=None, description="The collection information of this document."
245
- )
246
- acquisition: Optional[Acquisition] = Field(
247
- default=None,
248
- description=(
249
- "Information on how the document was obtained, for data governance"
250
- " purposes."
251
- ),
252
- )
253
92
 
93
+ class PictureMiscData(BaseModel):
94
+ """PictureMiscData."""
254
95
 
255
- class MinimalDocument(
256
- AliasModel,
257
- Generic[
258
- DescriptionAdvancedT,
259
- DescriptionAnalyticsT,
260
- IdentifierTypeT,
261
- LanguageT,
262
- CollectionNameTypeT,
263
- ],
264
- ):
265
- """Minimal model for a document."""
266
-
267
- name: StrictStr = Field(alias="_name")
268
- obj_type: Optional[StrictStr] = Field("document", alias="type")
269
- description: CCSDocumentDescription[
270
- DescriptionAdvancedT,
271
- DescriptionAnalyticsT,
272
- IdentifierTypeT,
273
- LanguageT,
274
- CollectionNameTypeT,
275
- ]
276
- file_info: FileInfoObject = Field(alias="file-info")
277
- main_text: Optional[list[Union[Ref, BaseText]]] = Field(
278
- default=None, alias="main-text"
279
- )
280
- figures: Optional[list[Figure]] = None
281
- tables: Optional[list[Table]] = None
282
-
283
-
284
- class CCSDocument(
285
- MinimalDocument,
286
- Generic[
287
- DescriptionAdvancedT,
288
- DescriptionAnalyticsT,
289
- IdentifierTypeT,
290
- LanguageT,
291
- CollectionNameTypeT,
96
+ kind: Literal["misc"] = "misc"
97
+ content: Dict[str, Any]
98
+
99
+
100
+ PictureDataType = Annotated[
101
+ Union[
102
+ PictureClassificationData,
103
+ PictureDescriptionData,
104
+ PictureMoleculeData,
105
+ PictureMiscData,
292
106
  ],
293
- ):
294
- """Model for a CCS-generated document."""
295
-
296
- obj_type: Optional[StrictStr] = Field("pdf-document", alias="type")
297
- bitmaps: Optional[list[BitmapObject]] = None
298
- equations: Optional[list[BaseCell]] = None
299
- footnotes: Optional[list[BaseText]] = None
300
- file_info: CCSFileInfoObject = Field(alias="file-info")
301
- main_text: Optional[list[Union[Ref, BaseText]]] = Field(
302
- default=None,
303
- alias="main-text",
304
- )
305
- page_dimensions: Optional[list[PageDimensions]] = Field(
306
- default=None, alias="page-dimensions"
307
- )
308
- page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers")
309
- page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers")
310
- s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data")
107
+ Field(discriminator="kind"),
108
+ ]
109
+
110
+
111
+ class TableCell(BaseModel):
112
+ """TableCell."""
113
+
114
+ bbox: Optional[BoundingBox] = None
115
+ row_span: int = 1
116
+ col_span: int = 1
117
+ start_row_offset_idx: int
118
+ end_row_offset_idx: int
119
+ start_col_offset_idx: int
120
+ end_col_offset_idx: int
121
+ text: str
122
+ column_header: bool = False
123
+ row_header: bool = False
124
+ row_section: bool = False
311
125
 
312
126
  @model_validator(mode="before")
313
127
  @classmethod
314
- def from_dict(cls, data):
315
- """Validates and fixes the input data."""
316
- if not isinstance(data, dict):
317
- return data
318
- description_collection = data["description"].get("collection")
319
- if not description_collection:
320
- data["description"].setdefault("collection", {})
321
-
322
- data["description"]["collection"].setdefault("type", "Document")
323
- logs = data["description"].get("logs")
324
- if not logs:
325
- data["description"].setdefault("logs", [])
326
-
327
- abstract = data["description"].get("abstract")
328
- if abstract is not None and not isinstance(abstract, list):
329
- if isinstance(abstract, str):
330
- data["description"]["abstract"] = [abstract]
331
- else:
332
- data["description"].pop("abstract")
128
+ def from_dict_format(cls, data: Any) -> Any:
129
+ """from_dict_format."""
130
+ if isinstance(data, Dict):
131
+ # Check if this is a native BoundingBox or a bbox from docling-ibm-models
132
+ if (
133
+ # "bbox" not in data
134
+ # or data["bbox"] is None
135
+ # or isinstance(data["bbox"], BoundingBox)
136
+ "text"
137
+ in data
138
+ ):
139
+ return data
140
+ text = data["bbox"].get("token", "")
141
+ if not len(text):
142
+ text_cells = data.pop("text_cell_bboxes", None)
143
+ if text_cells:
144
+ for el in text_cells:
145
+ text += el["token"] + " "
146
+
147
+ text = text.strip()
148
+ data["text"] = text
333
149
 
334
- for key in ["affiliations", "authors"]:
335
- descr = data["description"].get(key)
336
- if descr is not None and not isinstance(descr, list):
337
- if isinstance(descr, dict):
338
- data["description"][key] = [descr]
339
- else:
340
- data["description"].pop(key)
150
+ return data
341
151
 
342
- if data.get("main-text"):
343
- for item in data["main-text"]:
344
- if ref := item.pop("__ref", None):
345
- item["$ref"] = ref
346
152
 
347
- return data
153
+ class TableData(BaseModel): # TBD
154
+ """BaseTableData."""
348
155
 
156
+ table_cells: List[TableCell] = []
157
+ num_rows: int = 0
158
+ num_cols: int = 0
349
159
 
350
- class ExportedCCSDocument(
351
- MinimalDocument,
352
- Generic[
353
- DescriptionAdvancedT,
354
- DescriptionAnalyticsT,
355
- IdentifierTypeT,
356
- LanguageT,
357
- CollectionNameTypeT,
358
- ],
359
- ):
360
- """Document model for Docling."""
160
+ @computed_field # type: ignore
161
+ @property
162
+ def grid(
163
+ self,
164
+ ) -> List[List[TableCell]]:
165
+ """grid."""
166
+ # Initialise empty table data grid (only empty cells)
167
+ table_data = [
168
+ [
169
+ TableCell(
170
+ text="",
171
+ start_row_offset_idx=i,
172
+ end_row_offset_idx=i + 1,
173
+ start_col_offset_idx=j,
174
+ end_col_offset_idx=j + 1,
175
+ )
176
+ for j in range(self.num_cols)
177
+ ]
178
+ for i in range(self.num_rows)
179
+ ]
180
+
181
+ # Overwrite cells in table data for which there is actual cell content.
182
+ for cell in self.table_cells:
183
+ for i in range(
184
+ min(cell.start_row_offset_idx, self.num_rows),
185
+ min(cell.end_row_offset_idx, self.num_rows),
186
+ ):
187
+ for j in range(
188
+ min(cell.start_col_offset_idx, self.num_cols),
189
+ min(cell.end_col_offset_idx, self.num_cols),
190
+ ):
191
+ table_data[i][j] = cell
192
+
193
+ return table_data
361
194
 
362
- obj_type: Optional[StrictStr] = Field(
363
- "pdf-document",
364
- alias="type",
365
- json_schema_extra=es_field(type="keyword", ignore_above=8191),
195
+
196
+ class DocumentOrigin(BaseModel):
197
+ """FileSource."""
198
+
199
+ mimetype: str # the mimetype of the original file
200
+ binary_hash: Uint64 # the binary hash of the original file.
201
+ # TODO: Change to be Uint64 and provide utility method to generate
202
+
203
+ filename: str # The name of the original file, including extension, without path.
204
+ # Could stem from filesystem, source URI, Content-Disposition header, ...
205
+
206
+ uri: Optional[AnyUrl] = (
207
+ None # any possible reference to a source file,
208
+ # from any file handler protocol (e.g. https://, file://, s3://)
366
209
  )
367
- bitmaps: Optional[list[BitmapObject]] = None
368
- equations: Optional[list[BaseCell]] = None
369
- footnotes: Optional[list[BaseText]] = None
370
- description: CCSDocumentDescription[
371
- DescriptionAdvancedT,
372
- DescriptionAnalyticsT,
373
- IdentifierTypeT,
374
- LanguageT,
375
- CollectionNameTypeT,
210
+
211
+ _extra_mimetypes: typing.ClassVar[List[str]] = [
212
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
213
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
214
+ "application/vnd.openxmlformats-officedocument.presentationml.template",
215
+ "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
216
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
217
+ "text/asciidoc",
376
218
  ]
377
- file_info: CCSFileInfoObject = Field(alias="file-info")
378
- main_text: Optional[list[Union[Ref, BaseText]]] = Field(
379
- default=None, alias="main-text"
380
- )
381
- page_dimensions: Optional[list[PageDimensions]] = Field(
382
- default=None, alias="page-dimensions"
219
+
220
+ @field_validator("binary_hash", mode="before")
221
+ @classmethod
222
+ def parse_hex_string(cls, value):
223
+ """parse_hex_string."""
224
+ if isinstance(value, str):
225
+ try:
226
+ # Convert hex string to an integer
227
+ hash_int = Uint64(value, 16)
228
+ # Mask to fit within 64 bits (unsigned)
229
+ return (
230
+ hash_int & 0xFFFFFFFFFFFFFFFF
231
+ ) # TODO be sure it doesn't clip uint64 max
232
+ except ValueError:
233
+ raise ValueError(f"Invalid sha256 hexdigest: {value}")
234
+ return value # If already an int, return it as is.
235
+
236
+ @field_validator("mimetype")
237
+ @classmethod
238
+ def validate_mimetype(cls, v):
239
+ """validate_mimetype."""
240
+ # Check if the provided MIME type is valid using mimetypes module
241
+ if v not in mimetypes.types_map.values() and v not in cls._extra_mimetypes:
242
+ raise ValueError(f"'{v}' is not a valid MIME type")
243
+ return v
244
+
245
+
246
+ class RefItem(BaseModel):
247
+ """RefItem."""
248
+
249
+ cref: str = Field(alias="$ref", pattern=_JSON_POINTER_REGEX)
250
+
251
+ # This method makes RefItem compatible with DocItem
252
+ def get_ref(self):
253
+ """get_ref."""
254
+ return self
255
+
256
+ model_config = ConfigDict(
257
+ populate_by_name=True,
383
258
  )
384
- page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers")
385
- page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers")
386
- s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data")
387
- identifiers: Optional[list[Identifier[IdentifierTypeT]]] = None
388
259
 
389
- @model_validator(mode="before")
260
+ def resolve(self, doc: "DoclingDocument"):
261
+ """resolve."""
262
+ path_components = self.cref.split("/")
263
+ if (num_comps := len(path_components)) == 3:
264
+ _, path, index_str = path_components
265
+ index = int(index_str)
266
+ obj = doc.__getattribute__(path)[index]
267
+ elif num_comps == 2:
268
+ _, path = path_components
269
+ obj = doc.__getattribute__(path)
270
+ else:
271
+ raise RuntimeError(f"Unsupported number of path components: {num_comps}")
272
+ return obj
273
+
274
+
275
+ class ImageRef(BaseModel):
276
+ """ImageRef."""
277
+
278
+ mimetype: str
279
+ dpi: int
280
+ size: Size
281
+ uri: AnyUrl
282
+ _pil: Optional[PILImage.Image] = None
283
+
284
+ @property
285
+ def pil_image(self) -> PILImage.Image:
286
+ """Return the PIL Image."""
287
+ if self._pil is not None:
288
+ return self._pil
289
+
290
+ if str(self.uri).startswith("data:"):
291
+ encoded_img = str(self.uri).split(",")[1]
292
+ decoded_img = base64.b64decode(encoded_img)
293
+ self._pil = PILImage.open(BytesIO(decoded_img))
294
+ else:
295
+ self._pil = PILImage.open(str(self.uri))
296
+
297
+ return self._pil
298
+
299
+ @field_validator("mimetype")
390
300
  @classmethod
391
- def from_dict(cls, data):
392
- """Fix ref in main-text."""
393
- if not isinstance(data, dict):
394
- return data
395
- if data.get("main-text"):
396
- for item in data["main-text"]:
397
- if ref := item.pop("__ref", None):
398
- item["$ref"] = ref
301
+ def validate_mimetype(cls, v):
302
+ """validate_mimetype."""
303
+ # Check if the provided MIME type is valid using mimetypes module
304
+ if v not in mimetypes.types_map.values():
305
+ raise ValueError(f"'{v}' is not a valid MIME type")
306
+ return v
399
307
 
400
- return data
308
+ @classmethod
309
+ def from_pil(cls, image: PILImage.Image, dpi: int) -> Self:
310
+ """Construct ImageRef from a PIL Image."""
311
+ buffered = BytesIO()
312
+ image.save(buffered, format="PNG")
313
+ img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
314
+ img_uri = f"data:image/png;base64,{img_str}"
315
+ return cls(
316
+ mimetype="image/png",
317
+ dpi=dpi,
318
+ size=Size(width=image.width, height=image.height),
319
+ uri=img_uri,
320
+ _pil=image,
321
+ )
401
322
 
402
- def _resolve_ref(self, item: Ref) -> Optional[Union[BaseCell, BaseText]]:
403
- """Return the resolved reference.
404
323
 
405
- Resolved the Ref object within the document.
406
- If the object is not found, None is returned.
324
+ class ProvenanceItem(BaseModel):
325
+ """ProvenanceItem."""
326
+
327
+ page_no: int
328
+ bbox: BoundingBox
329
+ charspan: Tuple[int, int]
330
+
331
+
332
+ class NodeItem(BaseModel):
333
+ """NodeItem."""
334
+
335
+ self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
336
+ parent: Optional[RefItem] = None
337
+ children: List[RefItem] = []
338
+
339
+ model_config = ConfigDict(extra="forbid")
340
+
341
+ def get_ref(self):
342
+ """get_ref."""
343
+ return RefItem(cref=self.self_ref)
344
+
345
+
346
+ class GroupItem(NodeItem): # Container type, can't be a leaf node
347
+ """GroupItem."""
348
+
349
+ name: str = (
350
+ "group" # Name of the group, e.g. "Introduction Chapter",
351
+ # "Slide 5", "Navigation menu list", ...
352
+ )
353
+ label: GroupLabel = GroupLabel.UNSPECIFIED
354
+
355
+
356
+ class DocItem(
357
+ NodeItem
358
+ ): # Base type for any element that carries content, can be a leaf node
359
+ """DocItem."""
360
+
361
+ label: DocItemLabel
362
+ prov: List[ProvenanceItem] = []
363
+
364
+ def get_location_tokens(
365
+ self,
366
+ doc: "DoclingDocument",
367
+ new_line: str,
368
+ xsize: int = 100,
369
+ ysize: int = 100,
370
+ add_page_index: bool = True,
371
+ ) -> str:
372
+ """Get the location string for the BaseCell."""
373
+ if not len(self.prov):
374
+ return ""
375
+
376
+ location = ""
377
+ for prov in self.prov:
378
+ page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
379
+
380
+ page_i = -1
381
+ if add_page_index:
382
+ page_i = prov.page_no
383
+
384
+ loc_str = DocumentToken.get_location(
385
+ bbox=prov.bbox.to_bottom_left_origin(page_h).as_tuple(),
386
+ page_w=page_w,
387
+ page_h=page_h,
388
+ xsize=xsize,
389
+ ysize=ysize,
390
+ page_i=page_i,
391
+ )
392
+ location += f"{loc_str}{new_line}"
393
+
394
+ return location
395
+
396
+
397
+ class TextItem(DocItem):
398
+ """TextItem."""
399
+
400
+ orig: str # untreated representation
401
+ text: str # sanitized representation
402
+
403
+ def export_to_document_tokens(
404
+ self,
405
+ doc: "DoclingDocument",
406
+ new_line: str = "\n",
407
+ xsize: int = 100,
408
+ ysize: int = 100,
409
+ add_location: bool = True,
410
+ add_content: bool = True,
411
+ add_page_index: bool = True,
412
+ ):
413
+ r"""Export text element to document tokens format.
414
+
415
+ :param doc: "DoclingDocument":
416
+ :param new_line: str: (Default value = "\n")
417
+ :param xsize: int: (Default value = 100)
418
+ :param ysize: int: (Default value = 100)
419
+ :param add_location: bool: (Default value = True)
420
+ :param add_content: bool: (Default value = True)
421
+ :param add_page_index: bool: (Default value = True)
422
+
407
423
  """
408
- result: Optional[Union[BaseCell, BaseText]] = None
409
-
410
- # NOTE: currently only resolves refs explicitely, such that we can make
411
- # assumptions on ref parts
412
- if item.obj_type == "table" and self.tables:
413
- parts = item.ref.split("/")
414
- result = self.tables[int(parts[2])]
415
- elif item.obj_type == "figure" and self.figures:
416
- parts = item.ref.split("/")
417
- result = self.figures[int(parts[2])]
418
- elif item.obj_type == "equation" and self.equations:
419
- parts = item.ref.split("/")
420
- result = self.equations[int(parts[2])]
421
- elif item.obj_type == "footnote" and self.footnotes:
422
- parts = item.ref.split("/")
423
- result = self.footnotes[int(parts[2])]
424
+ body = f"<{self.label.value}>"
424
425
 
425
- return result
426
+ # TODO: This must be done through an explicit mapping.
427
+ # assert DocumentToken.is_known_token(
428
+ # body
429
+ # ), f"failed DocumentToken.is_known_token({body})"
430
+
431
+ if add_location:
432
+ body += self.get_location_tokens(
433
+ doc=doc,
434
+ new_line="",
435
+ xsize=xsize,
436
+ ysize=ysize,
437
+ add_page_index=add_page_index,
438
+ )
439
+
440
+ if add_content and self.text is not None:
441
+ body += self.text.strip()
442
+
443
+ body += f"</{self.label.value}>{new_line}"
444
+
445
+ return body
446
+
447
+
448
+ class SectionHeaderItem(TextItem):
449
+ """SectionItem."""
450
+
451
+ label: typing.Literal[DocItemLabel.SECTION_HEADER] = DocItemLabel.SECTION_HEADER
452
+ level: LevelNumber
453
+
454
+
455
+ class ListItem(TextItem):
456
+ """SectionItem."""
457
+
458
+ label: typing.Literal[DocItemLabel.LIST_ITEM] = DocItemLabel.LIST_ITEM
459
+ enumerated: bool = False
460
+ marker: str # The bullet or number symbol that prefixes this list item
461
+
462
+
463
+ class FloatingItem(DocItem):
464
+ """FloatingItem."""
465
+
466
+ captions: List[RefItem] = []
467
+ references: List[RefItem] = []
468
+ footnotes: List[RefItem] = []
469
+ image: Optional[ImageRef] = None
470
+
471
+ def caption_text(self, doc: "DoclingDocument") -> str:
472
+ """Computes the caption as a single text."""
473
+ text = ""
474
+ for cap in self.captions:
475
+ text += cap.resolve(doc).text
476
+ return text
477
+
478
+
479
+ class PictureItem(FloatingItem):
480
+ """PictureItem."""
481
+
482
+ label: typing.Literal[DocItemLabel.PICTURE] = DocItemLabel.PICTURE
483
+
484
+ annotations: List[PictureDataType] = []
485
+
486
+ def export_to_document_tokens(
487
+ self,
488
+ doc: "DoclingDocument",
489
+ new_line: str = "\n",
490
+ xsize: int = 100,
491
+ ysize: int = 100,
492
+ add_location: bool = True,
493
+ add_caption: bool = True,
494
+ add_content: bool = True, # not used at the moment
495
+ add_page_index: bool = True,
496
+ ):
497
+ r"""Export picture to document tokens format.
498
+
499
+ :param doc: "DoclingDocument":
500
+ :param new_line: str: (Default value = "\n")
501
+ :param xsize: int: (Default value = 100)
502
+ :param ysize: int: (Default value = 100)
503
+ :param add_location: bool: (Default value = True)
504
+ :param add_caption: bool: (Default value = True)
505
+ :param add_content: bool: (Default value = True)
506
+ :param # not used at the momentadd_page_index: bool: (Default value = True)
507
+
508
+ """
509
+ body = f"{DocumentToken.BEG_FIGURE.value}{new_line}"
510
+
511
+ if add_location:
512
+ body += self.get_location_tokens(
513
+ doc=doc,
514
+ new_line=new_line,
515
+ xsize=xsize,
516
+ ysize=ysize,
517
+ add_page_index=add_page_index,
518
+ )
519
+
520
+ if add_caption and len(self.captions):
521
+ text = self.caption_text(doc)
522
+
523
+ if len(text):
524
+ body += f"{DocumentToken.BEG_CAPTION.value}"
525
+ body += f"{text.strip()}"
526
+ body += f"{DocumentToken.END_CAPTION.value}"
527
+ body += f"{new_line}"
528
+
529
+ body += f"{DocumentToken.END_FIGURE.value}{new_line}"
530
+
531
+ return body
426
532
 
427
- def get_map_to_page_dimensions(self):
428
- """Get a map from page-index (start at 1) to page-dim [width, height]."""
429
- pagedims = {}
430
533
 
431
- if self.page_dimensions is not None:
432
- for _ in self.page_dimensions:
433
- pagedims[_.page] = [_.width, _.height]
534
class TableItem(FloatingItem):
    """Floating item that carries a table.

    ``data`` holds the cells; the ``export_to_*`` methods render them as a
    pandas DataFrame, a markdown table, an HTML table or document tokens.
    """

    data: TableData
    label: typing.Literal[DocItemLabel.TABLE] = DocItemLabel.TABLE

    def export_to_dataframe(self) -> pd.DataFrame:
        """Export the table as a Pandas DataFrame.

        The leading run of grid rows that contain at least one column-header
        cell is folded into the column names (texts of stacked header rows
        are joined with "."); every remaining row becomes a data row.

        :returns: The table as a DataFrame; an empty DataFrame when the table
            has no rows or no columns.
        :raises RuntimeError: If an empty grid row is met while counting the
            header rows.
        """
        if self.data.num_rows == 0 or self.data.num_cols == 0:
            return pd.DataFrame()

        # Count how many leading rows are column headers
        num_headers = 0
        for row in self.data.grid:
            if len(row) == 0:
                raise RuntimeError(
                    f"Invalid table. {len(row)=} but {self.data.num_cols=}."
                )
            if not any(cell.column_header for cell in row):
                break
            num_headers += 1

        # Create the column names from all col_headers
        columns: Optional[List[str]] = None
        if num_headers > 0:
            columns = ["" for _ in range(self.data.num_cols)]
            for header_row in self.data.grid[:num_headers]:
                for j, cell in enumerate(header_row):
                    col_name = cell.text
                    if columns[j] != "":
                        # a name from an earlier header row exists: join with "."
                        col_name = f".{col_name}"
                    columns[j] += col_name

        # Create table data from the non-header rows
        table_data = [
            [cell.text for cell in row] for row in self.data.grid[num_headers:]
        ]

        return pd.DataFrame(table_data, columns=columns)

    def export_to_markdown(self) -> str:
        """Export the table as markdown.

        The first grid row is used as the header line. Nothing is rendered
        unless the grid has at least two rows and a non-empty first row.

        :returns: The table in tabulate's "github" format, or "".
        """
        table = [[cell.text for cell in row] for row in self.data.grid]

        md_table = ""
        if len(table) > 1 and len(table[0]) > 0:
            try:
                md_table = tabulate(table[1:], headers=table[0], tablefmt="github")
            except ValueError:
                # Retry with number parsing switched off (presumably the
                # failure comes from tabulate's numeric detection - confirm).
                md_table = tabulate(
                    table[1:],
                    headers=table[0],
                    tablefmt="github",
                    disable_numparse=True,
                )
        return md_table

    def export_to_html(self) -> str:
        """Export the table as html.

        Emits one ``<tr>`` per grid row. A cell is written only at the grid
        position where it starts, so a cell that occupies several positions
        appears once, with ``rowspan``/``colspan`` attributes when the span
        is greater than 1. Column-header cells use ``<th>``, others ``<td>``.
        Cell text is HTML-escaped so that characters such as ``<`` or ``&``
        cannot break the generated markup.

        :returns: ``<table>...</table>`` markup, or "" when the table has no
            cells.
        """
        import html  # local import: only this method needs it

        if not len(self.data.table_cells):
            return ""

        rows_html = []
        for i in range(self.data.num_rows):
            cells_html = []
            for j in range(self.data.num_cols):
                cell = self.data.grid[i][j]

                # Render a cell only at its own start position; the same cell
                # may also be found at other (i, j) positions it covers.
                if cell.start_row_offset_idx != i:
                    continue
                if cell.start_col_offset_idx != j:
                    continue

                content = html.escape(cell.text.strip())
                celltag = "th" if cell.column_header else "td"

                opening_tag = celltag
                if cell.row_span > 1:
                    opening_tag += f' rowspan="{cell.row_span}"'
                if cell.col_span > 1:
                    opening_tag += f' colspan="{cell.col_span}"'

                cells_html.append(f"<{opening_tag}>{content}</{celltag}>")
            rows_html.append("<tr>" + "".join(cells_html) + "</tr>")

        return "<table>" + "".join(rows_html) + "</table>"

    def export_to_document_tokens(
        self,
        doc: "DoclingDocument",
        new_line: str = "\n",
        xsize: int = 100,
        ysize: int = 100,
        add_location: bool = True,
        add_caption: bool = True,
        add_content: bool = True,
        add_cell_location: bool = True,
        add_cell_label: bool = True,
        add_cell_text: bool = True,
        add_page_index: bool = True,
    ) -> str:
        """Export table to document tokens format.

        :param doc: Document this table belongs to; used to resolve the
            caption and to look up page sizes.
        :param new_line: String appended after each emitted line.
        :param xsize: Passed through to the location-token helpers.
        :param ysize: Passed through to the location-token helpers.
        :param add_location: Emit the location tokens of the table itself.
        :param add_caption: Emit the caption (if any, and non-empty).
        :param add_content: Emit the rows and cells.
        :param add_cell_location: Emit a location token per cell that has a
            bounding box (requires the table to have provenance).
        :param add_cell_label: Emit the cell kind tag (``<col_header>``,
            ``<row_header>``, ``<row_section>`` or ``<body>``).
        :param add_cell_text: Emit the stripped cell text.
        :param add_page_index: Pass the page number to the location helpers;
            for cells, -1 is passed instead when this is False.
        :returns: The table serialized as a document-token string.
        """
        parts = [f"{DocumentToken.BEG_TABLE.value}{new_line}"]

        if add_location:
            parts.append(
                self.get_location_tokens(
                    doc=doc,
                    new_line=new_line,
                    xsize=xsize,
                    ysize=ysize,
                    add_page_index=add_page_index,
                )
            )

        if add_caption and len(self.captions):
            text = self.caption_text(doc)

            if len(text):
                parts.append(f"{DocumentToken.BEG_CAPTION.value}")
                parts.append(f"{text.strip()}")
                parts.append(f"{DocumentToken.END_CAPTION.value}")
                parts.append(f"{new_line}")

        if add_content and len(self.data.table_cells) > 0:
            for i, row in enumerate(self.data.grid):
                parts.append(f"<row_{i}>")
                for j, col in enumerate(row):

                    text = col.text.strip() if add_cell_text else ""

                    cell_loc = ""
                    if (
                        col.bbox is not None
                        and add_cell_location
                        and len(self.prov) > 0
                    ):
                        # Cell locations are computed against the page of the
                        # table's first provenance entry.
                        page_no = self.prov[0].page_no
                        page_w, page_h = doc.pages[page_no].size.as_tuple()
                        cell_loc = DocumentToken.get_location(
                            bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
                            page_w=page_w,
                            page_h=page_h,
                            xsize=xsize,
                            ysize=ysize,
                            page_i=page_no if add_page_index else -1,
                        )

                    cell_label = ""
                    if add_cell_label:
                        if col.column_header:
                            cell_label = "<col_header>"
                        elif col.row_header:
                            cell_label = "<row_header>"
                        elif col.row_section:
                            cell_label = "<row_section>"
                        else:
                            cell_label = "<body>"

                    parts.append(f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>")

                parts.append(f"</row_{i}>{new_line}")

        parts.append(f"{DocumentToken.END_TABLE.value}{new_line}")

        return "".join(parts)
760
+
761
+
762
class KeyValueItem(DocItem):
    """KeyValueItem.

    Declares no fields or methods of its own here; everything is inherited
    from DocItem.
    """
764
+
765
+
766
# Type alias: any one of the concrete item classes listed below.
ContentItem = Union[
    TextItem, SectionHeaderItem, ListItem, PictureItem, TableItem, KeyValueItem
]
769
+
770
+
771
class PageItem(BaseModel):
    """PageItem.

    Per-page record: the page size, an optional image reference and the page
    number.
    """

    # A page carries separate root items for furniture and body,
    # only referencing items on the page
    size: Size  # page dimensions
    image: Optional[ImageRef] = None  # optional image reference for the page
    # NOTE(review): presumably the same number used as the key in
    # DoclingDocument.pages - confirm against the code that fills `pages`.
    page_no: int
779
+
780
+
781
+ class DoclingDocument(BaseModel):
782
+ """DoclingDocument."""
783
+
784
+ schema_name: typing.Literal["DoclingDocument"] = "DoclingDocument"
785
+ version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
786
+ CURRENT_VERSION
787
+ )
788
+ name: str # The working name of this document, without extensions
789
+ # (could be taken from originating doc, or just "Untitled 1")
790
+ origin: Optional[DocumentOrigin] = (
791
+ None # DoclingDocuments may specify an origin (converted to DoclingDocument).
792
+ # This is optional, e.g. a DoclingDocument could also be entirely
793
+ # generated from synthetic data.
794
+ )
795
+
796
+ furniture: GroupItem = GroupItem(
797
+ name="_root_", self_ref="#/furniture"
798
+ ) # List[RefItem] = []
799
+ body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []
800
+
801
+ groups: List[GroupItem] = []
802
+ texts: List[Union[SectionHeaderItem, ListItem, TextItem]] = []
803
+ pictures: List[PictureItem] = []
804
+ tables: List[TableItem] = []
805
+ key_value_items: List[KeyValueItem] = []
806
+
807
+ pages: Dict[int, PageItem] = {} # empty as default
808
+
809
def add_group(
    self,
    label: Optional[GroupLabel] = None,
    name: Optional[str] = None,
    parent: Optional[GroupItem] = None,
) -> GroupItem:
    """Create a group, register it in ``self.groups`` and link it to a parent.

    :param label: Assigned to the new group only when not None.
    :param name: Assigned to the new group only when not None.
    :param parent: Group to attach the new group to; ``self.body`` is used
        when no parent is given.
    :returns: The newly created GroupItem.
    """
    owner = parent or self.body

    # The new group's reference is its position in self.groups.
    ref_path = f"#/groups/{len(self.groups)}"

    new_group = GroupItem(self_ref=ref_path, parent=owner.get_ref())
    if name is not None:
        new_group.name = name
    if label is not None:
        new_group.label = label

    self.groups.append(new_group)
    owner.children.append(RefItem(cref=ref_path))

    return new_group
838
+
839
def add_list_item(
    self,
    text: str,
    enumerated: bool = False,
    marker: Optional[str] = None,
    orig: Optional[str] = None,
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[GroupItem] = None,
):
    """Create a list item, register it in ``self.texts`` and link it to a parent.

    :param text: Text of the list item.
    :param enumerated: Passed to the ListItem as its ``enumerated`` flag.
    :param marker: Marker of the item; "-" is used when none is given.
    :param orig: Original text; ``text`` is used when none is given.
    :param prov: Provenance entry appended to the item when given.
    :param parent: Group to attach the item to; ``self.body`` is used when
        no parent is given.
    :returns: The newly created ListItem.
    """
    owner = parent or self.body

    # The new item's reference is its position in self.texts.
    ref_path = f"#/texts/{len(self.texts)}"

    new_item = ListItem(
        text=text,
        orig=orig or text,
        self_ref=ref_path,
        parent=owner.get_ref(),
        enumerated=enumerated,
        marker=marker or "-",
    )
    if prov:
        new_item.prov.append(prov)

    self.texts.append(new_item)
    owner.children.append(RefItem(cref=ref_path))

    return new_item
882
+
883
def add_text(
    self,
    label: DocItemLabel,
    text: str,
    orig: Optional[str] = None,
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[GroupItem] = None,
):
    """Create a text item, register it in ``self.texts`` and link it to a parent.

    :param label: Label given to the new TextItem.
    :param text: Text of the item.
    :param orig: Original text; ``text`` is used when none is given.
    :param prov: Provenance entry appended to the item when given.
    :param parent: Group to attach the item to; ``self.body`` is used when
        no parent is given.
    :returns: The newly created TextItem.
    """
    owner = parent or self.body

    # The new item's reference is its position in self.texts.
    ref_path = f"#/texts/{len(self.texts)}"

    new_item = TextItem(
        label=label,
        text=text,
        orig=orig or text,
        self_ref=ref_path,
        parent=owner.get_ref(),
    )
    if prov:
        new_item.prov.append(prov)

    self.texts.append(new_item)
    owner.children.append(RefItem(cref=ref_path))

    return new_item
922
+
923
def add_table(
    self,
    data: TableData,
    caption: Optional[Union[TextItem, RefItem]] = None,  # This is not cool yet.
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[GroupItem] = None,
):
    """Create a table item, register it in ``self.tables`` and link it to a parent.

    :param data: Table content stored on the new TableItem.
    :param caption: Caption item or reference; its ``get_ref()`` is appended
        to the table's captions when given.
    :param prov: Provenance entry appended to the table when given.
    :param parent: Group to attach the table to; ``self.body`` is used when
        no parent is given.
    :returns: The newly created TableItem.
    """
    owner = parent or self.body

    # The new table's reference is its position in self.tables.
    ref_path = f"#/tables/{len(self.tables)}"

    new_table = TableItem(
        label=DocItemLabel.TABLE,
        data=data,
        self_ref=ref_path,
        parent=owner.get_ref(),
    )
    if prov:
        new_table.prov.append(prov)
    if caption:
        new_table.captions.append(caption.get_ref())

    self.tables.append(new_table)
    owner.children.append(RefItem(cref=ref_path))

    return new_table
957
+
958
def add_picture(
    self,
    annotations: Optional[List[PictureDataType]] = None,
    image: Optional[ImageRef] = None,
    caption: Optional[Union[TextItem, RefItem]] = None,
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[GroupItem] = None,
):
    """Create a picture item, register it in ``self.pictures`` and link it to a parent.

    :param annotations: Annotations stored on the new PictureItem; an empty
        list is used when none are given.
    :param image: Optional image reference stored on the picture.
    :param caption: Caption item or reference; its ``get_ref()`` is appended
        to the picture's captions when given.
    :param prov: Provenance entry appended to the picture when given.
    :param parent: Group to attach the picture to; ``self.body`` is used when
        no parent is given.
    :returns: The newly created PictureItem.
    """
    if not parent:
        parent = self.body

    # Build a fresh list per call instead of sharing one mutable default
    # object between all calls.
    if annotations is None:
        annotations = []

    picture_index = len(self.pictures)
    cref = f"#/pictures/{picture_index}"

    fig_item = PictureItem(
        label=DocItemLabel.PICTURE,
        annotations=annotations,
        image=image,
        self_ref=cref,
        parent=parent.get_ref(),
    )
    if prov:
        fig_item.prov.append(prov)
    if caption:
        fig_item.captions.append(caption.get_ref())

    self.pictures.append(fig_item)
    parent.children.append(RefItem(cref=cref))

    return fig_item
997
+
998
def add_heading(
    self,
    text: str,
    orig: Optional[str] = None,
    level: LevelNumber = 1,
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[GroupItem] = None,
):
    """Create a section header, register it in ``self.texts`` and link it to a parent.

    :param text: Text of the heading.
    :param orig: Original text; ``text`` is used when none is given.
    :param level: Heading level stored on the SectionHeaderItem.
    :param prov: Provenance entry appended to the heading when given.
    :param parent: Group to attach the heading to; ``self.body`` is used when
        no parent is given.
    :returns: The newly created SectionHeaderItem.
    """
    owner = parent or self.body

    # The new heading's reference is its position in self.texts.
    ref_path = f"#/texts/{len(self.texts)}"

    heading = SectionHeaderItem(
        level=level,
        text=text,
        orig=orig or text,
        self_ref=ref_path,
        parent=owner.get_ref(),
    )
    if prov:
        heading.prov.append(prov)

    self.texts.append(heading)
    owner.children.append(RefItem(cref=ref_path))

    return heading
1038
+
1039
def num_pages(self):
    """Return the number of entries in ``self.pages``."""
    page_count = len(self.pages)
    return page_count
1042
+
1043
def validate_tree(self, root) -> bool:
    """Check that parent references in the subtree under ``root`` are consistent.

    The subtree is valid when every child listed by a node has a ``parent``
    reference that resolves back to that node, recursively. A child without
    a parent reference makes the tree invalid.

    :param root: Node whose ``children`` references are checked.
    :returns: True when the whole subtree is consistent (a node without
        children is trivially valid), False otherwise.
    """
    for child_ref in root.children:
        child = child_ref.resolve(self)

        # A child must point back at the node that lists it; a missing
        # parent reference counts as a mismatch rather than an error.
        if child.parent is None or child.parent.resolve(self) != root:
            return False

        # Stop at the first inconsistent subtree.
        if not self.validate_tree(child):
            return False

    return True
1053
+
1054
def iterate_items(
    self,
    root: Optional[NodeItem] = None,
    with_groups: bool = False,
    traverse_pictures: bool = True,
    page_no: Optional[int] = None,
    _level: int = 0,  # fixed parameter, carries through the node nesting level
) -> typing.Iterable[Tuple[NodeItem, int]]:  # tuple of node and level
    """Walk the item tree depth-first, yielding ``(node, level)`` pairs.

    :param root: Node to start from; ``self.body`` is used when not given.
    :param with_groups: When False, GroupItem nodes are not yielded (their
        children are still visited).
    :param traverse_pictures: When False, PictureItem children are skipped
        together with everything below them.
    :param page_no: When given, a DocItem is yielded only if one of its
        provenance entries is on that page; each item is yielded at most
        once. Other node kinds are not filtered.
    :param _level: Internal; nesting depth of ``root``, incremented on each
        recursion step.
    :returns: Iterable of ``(node, nesting level)`` tuples.
    """
    if not root:
        root = self.body

    if not isinstance(root, GroupItem) or with_groups:
        if isinstance(root, DocItem):
            # any(): yield once even if several provenance entries match.
            if page_no is None or any(
                prov.page_no == page_no for prov in root.prov
            ):
                yield root, _level
        else:
            yield root, _level

    # Traverse children
    for child_ref in root.children:
        child = child_ref.resolve(self)

        if not isinstance(child, NodeItem):
            continue
        if isinstance(child, PictureItem) and not traverse_pictures:
            continue

        # Hand every filter down, so it applies at all depths and not only
        # to the first level.
        yield from self.iterate_items(
            child,
            with_groups=with_groups,
            traverse_pictures=traverse_pictures,
            page_no=page_no,
            _level=_level + 1,
        )
1096
+
1097
def print_element_tree(self):
    """Print one line per node of the body tree, indented by nesting level.

    Groups are included in the walk. Each line shows the running index and
    the label value; group lines also show the group name.
    """
    for ix, (item, level) in enumerate(self.iterate_items(with_groups=True)):
        if isinstance(item, GroupItem):
            description = f"{ix}: {item.label.value} with name={item.name}"
        elif isinstance(item, DocItem):
            description = f"{ix}: {item.label.value}"
        else:
            # neither a group nor a document item: nothing is printed
            continue
        print(" " * level, description)
1104
+
1105
def export_to_dict(self) -> Dict:
    """Return the document as a dict via ``model_dump``.

    The dump uses JSON mode and field aliases, and leaves out None values.
    """
    dump_options = {"mode": "json", "by_alias": True, "exclude_none": True}
    return self.model_dump(**dump_options)
436
1108
 
437
1109
  def export_to_markdown( # noqa: C901
438
1110
  self,
439
1111
  delim: str = "\n\n",
440
- main_text_start: int = 0,
441
- main_text_stop: Optional[int] = None,
442
- main_text_labels: list[str] = [
443
- "title",
444
- "subtitle-level-1",
445
- "paragraph",
446
- "caption",
447
- "table",
448
- "figure",
449
- ],
1112
+ from_element: int = 0,
1113
+ to_element: Optional[int] = None,
1114
+ labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
450
1115
  strict_text: bool = False,
451
1116
  image_placeholder: str = "<!-- image -->",
452
1117
  ) -> str:
@@ -455,66 +1120,77 @@ class ExportedCCSDocument(
455
1120
  Operates on a slice of the document's main_text as defined through arguments
456
1121
  main_text_start and main_text_stop; defaulting to the whole main_text.
457
1122
 
458
- Args:
459
- delim (str, optional): Delimiter to use when concatenating the various
1123
+ :param delim: Delimiter to use when concatenating the various
460
1124
  Markdown parts. Defaults to "\n\n".
461
- main_text_start (int, optional): Main-text slicing start index (inclusive).
1125
+ :type delim: str
1126
+ :param from_element: Body slicing start index (inclusive).
462
1127
  Defaults to 0.
463
- main_text_end (Optional[int], optional): Main-text slicing stop index
1128
+ :type from_element: int
1129
+ :param to_element: Body slicing stop index
464
1130
  (exclusive). Defaults to None.
465
- main_text_labels (list[str], optional): The labels to include in the
466
- markdown.
467
- strict_text (bool, optional): if true, the output will be only plain text
468
- without any markdown styling. Defaults to False.
469
- image_placeholder (str, optional): the placeholder to include to position
470
- images in the markdown. Defaults to a markdown comment "<!-- image -->".
471
-
472
- Returns:
473
- str: The exported Markdown representation.
1131
+ :type to_element: Optional[int]
1132
+ :param delim: str: (Default value = "\n\n")
1133
+ :param from_element: int: (Default value = 0)
1134
+ :param to_element: Optional[int]: (Default value = None)
1135
+ :param labels: set[DocItemLabel]
1136
+ :param "subtitle-level-1":
1137
+ :param "paragraph":
1138
+ :param "caption":
1139
+ :param "table":
1140
+ :param "Text":
1141
+ :param "text":
1142
+ :param ]:
1143
+ :param strict_text: bool: (Default value = False)
1144
+ :param image_placeholder str: (Default value = "<!-- image -->")
1145
+ the placeholder to include to position images in the markdown.
1146
+ :returns: The exported Markdown representation.
1147
+ :rtype: str
474
1148
  """
475
1149
  has_title = False
476
1150
  prev_text = ""
477
1151
  md_texts: list[str] = []
478
1152
 
479
- if self.main_text is not None:
480
- # collect all captions embedded in table and figure objects
481
- # to avoid repeating them
482
- embedded_captions = set()
483
- for orig_item in self.main_text[main_text_start:main_text_stop]:
484
- item = (
485
- self._resolve_ref(orig_item)
486
- if isinstance(orig_item, Ref)
487
- else orig_item
488
- )
489
- if item is None:
490
- continue
491
-
492
- if (
493
- isinstance(item, (Table, Figure))
494
- and item.text
495
- and item.obj_type in main_text_labels
496
- ):
497
- embedded_captions.add(item.text)
498
-
499
- # serialize document to markdown
500
- for orig_item in self.main_text[main_text_start:main_text_stop]:
501
- markdown_text = ""
502
-
503
- item = (
504
- self._resolve_ref(orig_item)
505
- if isinstance(orig_item, Ref)
506
- else orig_item
507
- )
508
- if item is None:
509
- continue
510
-
511
- item_type = item.obj_type
512
- if isinstance(item, BaseText) and item_type in main_text_labels:
1153
+ # collect all captions embedded in table and figure objects
1154
+ # to avoid repeating them
1155
+ embedded_captions = set()
1156
+ skip_count = 0
1157
+ for ix, (item, level) in enumerate(self.iterate_items(self.body)):
1158
+ if skip_count < from_element:
1159
+ skip_count += 1
1160
+ continue # skip as many items as you want
1161
+
1162
+ if to_element and ix >= to_element:
1163
+ break
1164
+
1165
+ if (
1166
+ isinstance(item, (TableItem, PictureItem))
1167
+ and len(item.captions) > 0
1168
+ and item.label in labels
1169
+ ):
1170
+ caption = item.caption_text(self)
1171
+ if caption:
1172
+ embedded_captions.add(caption)
1173
+
1174
+ skip_count = 0
1175
+ for ix, (item, level) in enumerate(self.iterate_items(self.body)):
1176
+ if skip_count < from_element:
1177
+ skip_count += 1
1178
+ continue # skip as many items as you want
1179
+
1180
+ if to_element and ix >= to_element:
1181
+ break
1182
+
1183
+ markdown_text = ""
1184
+
1185
+ if isinstance(item, DocItem):
1186
+ item_type = item.label
1187
+
1188
+ if isinstance(item, TextItem) and item_type in labels:
513
1189
  text = item.text
514
1190
 
515
1191
  # skip captions of they are embedded in the actual
516
1192
  # floating object
517
- if item_type == "caption" and text in embedded_captions:
1193
+ if item_type == DocItemLabel.CAPTION and text in embedded_captions:
518
1194
  continue
519
1195
 
520
1196
  # ignore repeated text
@@ -524,7 +1200,7 @@ class ExportedCCSDocument(
524
1200
  prev_text = text
525
1201
 
526
1202
  # first title match
527
- if item_type == "title" and not has_title:
1203
+ if item_type == DocItemLabel.TITLE and not has_title:
528
1204
  if strict_text:
529
1205
  markdown_text = f"{text}"
530
1206
  else:
@@ -532,78 +1208,89 @@ class ExportedCCSDocument(
532
1208
  has_title = True
533
1209
 
534
1210
  # secondary titles
535
- elif item_type in {"title", "subtitle-level-1"} or (
536
- has_title and item_type == "title"
537
- ):
1211
+ elif item_type in {
1212
+ DocItemLabel.TITLE,
1213
+ DocItemLabel.SECTION_HEADER,
1214
+ } or (has_title and item_type == DocItemLabel.TITLE):
538
1215
  if strict_text:
539
1216
  markdown_text = f"{text}"
540
1217
  else:
541
1218
  markdown_text = f"## {text}"
542
1219
 
1220
+ # secondary titles
1221
+ elif isinstance(item, ListItem):
1222
+ if item.enumerated:
1223
+ marker = item.marker
1224
+ else:
1225
+ marker = "-"
1226
+
1227
+ markdown_text = f"{marker} {text}"
1228
+
543
1229
  # normal text
544
1230
  else:
545
1231
  markdown_text = text
546
1232
 
547
- elif (
548
- isinstance(item, Table)
549
- and item.data
550
- and item_type in main_text_labels
551
- ):
1233
+ elif isinstance(item, TableItem) and item.data and item_type in labels:
1234
+ parts = []
552
1235
 
553
- md_table = ""
554
- table = []
555
- for row in item.data:
556
- tmp = []
557
- for col in row:
558
- tmp.append(col.text)
559
- table.append(tmp)
560
-
561
- if len(table) > 1 and len(table[0]) > 0:
562
- try:
563
- md_table = tabulate(
564
- table[1:], headers=table[0], tablefmt="github"
565
- )
566
- except ValueError:
567
- md_table = tabulate(
568
- table[1:],
569
- headers=table[0],
570
- tablefmt="github",
571
- disable_numparse=True,
572
- )
573
-
574
- markdown_text = ""
575
- if item.text:
576
- markdown_text = item.text
1236
+ # Compute the caption
1237
+ if caption := item.caption_text(self):
1238
+ parts.append(caption)
1239
+ parts.append("\n")
1240
+
1241
+ # Rendered the item
577
1242
  if not strict_text:
578
- markdown_text += "\n" + md_table
1243
+ md_table = item.export_to_markdown()
1244
+ if md_table:
1245
+ parts.append(item.export_to_markdown())
1246
+
1247
+ # Combine parts
1248
+ markdown_text = "\n".join(parts)
579
1249
 
580
- elif isinstance(item, Figure) and item_type in main_text_labels:
1250
+ elif isinstance(item, PictureItem) and item_type in labels:
1251
+ parts = []
581
1252
 
582
- markdown_text = ""
583
- if item.text:
584
- markdown_text = item.text
1253
+ # Compute the caption
1254
+ if caption := item.caption_text(self):
1255
+ parts.append(caption)
1256
+ parts.append("\n")
1257
+
1258
+ # Rendered the item
585
1259
  if not strict_text:
586
- markdown_text += f"\n{image_placeholder}"
1260
+ parts.append(f"{image_placeholder}")
1261
+
1262
+ # Combine parts
1263
+ markdown_text = "\n".join(parts)
587
1264
 
588
- if markdown_text:
589
- md_texts.append(markdown_text)
1265
+ if markdown_text:
1266
+ md_texts.append(markdown_text)
590
1267
 
591
1268
  result = delim.join(md_texts)
592
1269
  return result
593
1270
 
1271
def export_to_text(  # noqa: C901
    self,
    delim: str = "\n\n",
    from_element: int = 0,
    to_element: Optional[int] = None,
    labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
) -> str:
    """Export the document as plain text.

    Delegates to ``export_to_markdown`` with ``strict_text=True`` and an
    empty image placeholder; the other arguments are passed through as given.

    :param delim: Delimiter forwarded to ``export_to_markdown``.
    :param from_element: Start index forwarded to ``export_to_markdown``.
    :param to_element: Stop index forwarded to ``export_to_markdown``.
    :param labels: Labels forwarded to ``export_to_markdown``.
    :returns: The string returned by ``export_to_markdown``.
    """
    return self.export_to_markdown(
        delim=delim,
        from_element=from_element,
        to_element=to_element,
        labels=labels,
        strict_text=True,
        image_placeholder="",
    )
1287
+
594
1288
  def export_to_document_tokens(
595
1289
  self,
596
1290
  delim: str = "\n\n",
597
- main_text_start: int = 0,
598
- main_text_stop: Optional[int] = None,
599
- main_text_labels: list[str] = [
600
- "title",
601
- "subtitle-level-1",
602
- "paragraph",
603
- "caption",
604
- "table",
605
- "figure",
606
- ],
1291
+ from_element: int = 0,
1292
+ to_element: Optional[int] = None,
1293
+ labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
607
1294
  xsize: int = 100,
608
1295
  ysize: int = 100,
609
1296
  add_location: bool = True,
@@ -616,11 +1303,23 @@ class ExportedCCSDocument(
616
1303
  ) -> str:
617
1304
  r"""Exports the document content to an DocumentToken format.
618
1305
 
619
- Operates on a slice of the document's main_text as defined through arguments
620
- main_text_start and main_text_stop; defaulting to the whole main_text.
621
-
622
- Returns:
623
- str: The content of the document formatted as a DocTags string.
1306
+ Operates on a slice of the document's body as defined through arguments
1307
+ from_element and to_element; defaulting to the whole main_text.
1308
+
1309
+ :param delim: str: (Default value = "\n\n")
1310
+ :param from_element: int: (Default value = 0)
1311
+ :param to_element: Optional[int]: (Default value = None)
1312
+ :param labels: set[DocItemLabel]
1313
+ :param xsize: int: (Default value = 100)
1314
+ :param ysize: int: (Default value = 100)
1315
+ :param add_location: bool: (Default value = True)
1316
+ :param add_content: bool: (Default value = True)
1317
+ :param add_page_index: bool: (Default value = True)
1318
+ :param # table specific flagsadd_table_cell_location: bool
1319
+ :param add_table_cell_label: bool: (Default value = True)
1320
+ :param add_table_cell_text: bool: (Default value = True)
1321
+ :returns: The content of the document formatted as a DocTags string.
1322
+ :rtype: str
624
1323
  """
625
1324
  new_line = ""
626
1325
  if delim:
@@ -630,82 +1329,113 @@ class ExportedCCSDocument(
630
1329
 
631
1330
  # pagedims = self.get_map_to_page_dimensions()
632
1331
 
633
- if self.main_text is not None:
634
- for orig_item in self.main_text[main_text_start:main_text_stop]:
1332
+ skip_count = 0
1333
+ for ix, (item, level) in enumerate(self.iterate_items(self.body)):
1334
+ if skip_count < from_element:
1335
+ skip_count += 1
1336
+ continue # skip as many items as you want
635
1337
 
636
- item = (
637
- self._resolve_ref(orig_item)
638
- if isinstance(orig_item, Ref)
639
- else orig_item
640
- )
1338
+ if to_element and ix >= to_element:
1339
+ break
641
1340
 
642
- if item is None:
643
- continue
1341
+ if not isinstance(item, DocItem):
1342
+ continue
644
1343
 
645
- prov = item.prov
1344
+ prov = item.prov
646
1345
 
647
- page_i = -1
648
- page_w = 0.0
649
- page_h = 0.0
1346
+ page_i = -1
650
1347
 
651
- if (
652
- add_location
653
- and self.page_dimensions is not None
654
- and prov is not None
655
- and len(prov) > 0
656
- ):
1348
+ if add_location and len(self.pages) and len(prov) > 0:
657
1349
 
658
- page_i = prov[0].page
659
- page_dim = self.page_dimensions[page_i - 1]
1350
+ page_i = prov[0].page_no
1351
+ page_dim = self.pages[page_i].size
660
1352
 
661
- page_w = float(page_dim.width)
662
- page_h = float(page_dim.height)
1353
+ float(page_dim.width)
1354
+ float(page_dim.height)
663
1355
 
664
- item_type = item.obj_type
665
- if isinstance(item, BaseText) and (item_type in main_text_labels):
1356
+ item_type = item.label
1357
+ if isinstance(item, TextItem) and (item_type in labels):
666
1358
 
667
- doctags += item.export_to_document_tokens(
668
- new_line=new_line,
669
- page_w=page_w,
670
- page_h=page_h,
671
- xsize=xsize,
672
- ysize=ysize,
673
- add_location=add_location,
674
- add_content=add_content,
675
- add_page_index=add_page_index,
676
- )
1359
+ doctags += item.export_to_document_tokens(
1360
+ doc=self,
1361
+ new_line=new_line,
1362
+ xsize=xsize,
1363
+ ysize=ysize,
1364
+ add_location=add_location,
1365
+ add_content=add_content,
1366
+ add_page_index=add_page_index,
1367
+ )
677
1368
 
678
- elif isinstance(item, Table) and (item_type in main_text_labels):
679
-
680
- doctags += item.export_to_document_tokens(
681
- new_line=new_line,
682
- page_w=page_w,
683
- page_h=page_h,
684
- xsize=xsize,
685
- ysize=ysize,
686
- add_caption=True,
687
- add_location=add_location,
688
- add_content=add_content,
689
- add_cell_location=add_table_cell_location,
690
- add_cell_label=add_table_cell_label,
691
- add_cell_text=add_table_cell_text,
692
- add_page_index=add_page_index,
693
- )
1369
+ elif isinstance(item, TableItem) and (item_type in labels):
1370
+
1371
+ doctags += item.export_to_document_tokens(
1372
+ doc=self,
1373
+ new_line=new_line,
1374
+ xsize=xsize,
1375
+ ysize=ysize,
1376
+ add_caption=True,
1377
+ add_location=add_location,
1378
+ add_content=add_content,
1379
+ add_cell_location=add_table_cell_location,
1380
+ add_cell_label=add_table_cell_label,
1381
+ add_cell_text=add_table_cell_text,
1382
+ add_page_index=add_page_index,
1383
+ )
694
1384
 
695
- elif isinstance(item, Figure) and (item_type in main_text_labels):
696
-
697
- doctags += item.export_to_document_tokens(
698
- new_line=new_line,
699
- page_w=page_w,
700
- page_h=page_h,
701
- xsize=xsize,
702
- ysize=ysize,
703
- add_caption=True,
704
- add_location=add_location,
705
- add_content=add_content,
706
- add_page_index=add_page_index,
707
- )
1385
+ elif isinstance(item, PictureItem) and (item_type in labels):
1386
+
1387
+ doctags += item.export_to_document_tokens(
1388
+ doc=self,
1389
+ new_line=new_line,
1390
+ xsize=xsize,
1391
+ ysize=ysize,
1392
+ add_caption=True,
1393
+ add_location=add_location,
1394
+ add_content=add_content,
1395
+ add_page_index=add_page_index,
1396
+ )
708
1397
 
709
1398
  doctags += DocumentToken.END_DOCUMENT.value
710
1399
 
711
1400
  return doctags
1401
+
1402
def add_page(
    self, page_no: int, size: Size, image: Optional[ImageRef] = None
) -> PageItem:
    """Create a page item and register it on this document.

    :param page_no: int: Key under which the page is stored in ``self.pages``.
    :param size: Size: Size handed to the new ``PageItem``.
    :param image: Optional[ImageRef]: Image reference handed to the new
        ``PageItem``. (Default value = None)
    :returns: The page item that was created and stored.
    :rtype: PageItem
    """
    page = PageItem(page_no=page_no, size=size, image=image)
    # Stored by subscript assignment, so whatever was previously held
    # under page_no is replaced.
    self.pages[page_no] = page
    return page
1415
+
1416
@field_validator("version")
@classmethod
def check_version_is_compatible(cls, v: str) -> str:
    """Check if this document version is compatible with current version.

    A document version is accepted when it matches ``VERSION_PATTERN``, its
    major component equals the major component of ``CURRENT_VERSION`` and
    its minor component is not greater than the current minor component.

    :param v: str: Version string found on the document being validated.
    :returns: ``CURRENT_VERSION``; an accepted document is stamped with the
        current schema version rather than keeping its own.
    :rtype: str
    :raises ValueError: If either version does not match the pattern, the
        major components differ, or the document's minor component is newer.
    """
    current_match = re.match(VERSION_PATTERN, CURRENT_VERSION)
    doc_match = re.match(VERSION_PATTERN, v)
    if (
        doc_match is None
        or current_match is None
        or doc_match["major"] != current_match["major"]
        # Match groups are strings; comparing them directly is
        # lexicographic ("10" > "9" is False, "9" > "10" is True), which
        # misjudges multi-digit minor versions. Compare numerically.
        # NOTE(review): assumes the "minor" group matches digits only --
        # confirm against VERSION_PATTERN.
        or int(doc_match["minor"]) > int(current_match["minor"])
    ):
        raise ValueError(
            f"incompatible version {v} with schema version {CURRENT_VERSION}"
        )
    return CURRENT_VERSION
1433
+
1434
@model_validator(mode="after")  # type: ignore
@classmethod
def validate_document(cls, d: "DoclingDocument"):
    """Check the body and furniture trees of a document after validation.

    :param d: "DoclingDocument": Document instance to check.
    :returns: The same document ``d`` when both trees pass
        ``validate_tree``.
    :raises ValueError: If ``validate_tree`` rejects the body tree or the
        furniture tree.
    """
    # The body is checked first; the furniture tree is only checked when
    # the body tree passed.
    for root in (d.body, d.furniture):
        if not d.validate_tree(root):
            raise ValueError("Document hierarchy is inconsistent.")

    return d