docling-core 1.7.2__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (36) hide show
  1. docling_core/transforms/chunker/__init__.py +2 -8
  2. docling_core/transforms/chunker/base.py +27 -40
  3. docling_core/transforms/chunker/hierarchical_chunker.py +144 -312
  4. docling_core/types/__init__.py +12 -8
  5. docling_core/types/doc/__init__.py +25 -0
  6. docling_core/types/doc/base.py +136 -451
  7. docling_core/types/doc/document.py +1288 -559
  8. docling_core/types/{experimental → doc}/labels.py +4 -1
  9. docling_core/types/legacy_doc/__init__.py +6 -0
  10. docling_core/types/legacy_doc/base.py +485 -0
  11. docling_core/types/{doc → legacy_doc}/doc_ann.py +1 -1
  12. docling_core/types/{doc → legacy_doc}/doc_ocr.py +1 -1
  13. docling_core/types/{doc → legacy_doc}/doc_raw.py +1 -1
  14. docling_core/types/legacy_doc/document.py +715 -0
  15. docling_core/types/rec/subject.py +1 -1
  16. docling_core/utils/generate_docs.py +82 -0
  17. docling_core/utils/{ds_generate_jsonschema.py → generate_jsonschema.py} +4 -4
  18. docling_core/utils/validators.py +3 -3
  19. {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/METADATA +10 -10
  20. {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/RECORD +24 -31
  21. docling_core-2.0.0.dist-info/entry_points.txt +5 -0
  22. docling_core/transforms/id_generator/__init__.py +0 -12
  23. docling_core/transforms/id_generator/base.py +0 -30
  24. docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -27
  25. docling_core/transforms/id_generator/uuid_generator.py +0 -34
  26. docling_core/transforms/metadata_extractor/__init__.py +0 -13
  27. docling_core/transforms/metadata_extractor/base.py +0 -59
  28. docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +0 -59
  29. docling_core/types/experimental/__init__.py +0 -30
  30. docling_core/types/experimental/base.py +0 -167
  31. docling_core/types/experimental/document.py +0 -1192
  32. docling_core/utils/ds_generate_docs.py +0 -144
  33. docling_core-1.7.2.dist-info/entry_points.txt +0 -5
  34. /docling_core/types/{doc → legacy_doc}/tokens.py +0 -0
  35. {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/LICENSE +0 -0
  36. {docling_core-1.7.2.dist-info → docling_core-2.0.0.dist-info}/WHEEL +0 -0
@@ -1,452 +1,1116 @@
1
- #
2
- # Copyright IBM Corp. 2024 - 2024
3
- # SPDX-License-Identifier: MIT
4
- #
5
-
6
1
  """Models for the Docling Document data type."""
7
2
 
8
- from datetime import datetime
9
- from typing import Generic, Optional, Union
3
+ import base64
4
+ import mimetypes
5
+ import re
6
+ import typing
7
+ from io import BytesIO
8
+ from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
10
9
 
10
+ import pandas as pd
11
+ from PIL import Image as PILImage
11
12
  from pydantic import (
12
- AnyHttpUrl,
13
+ AnyUrl,
13
14
  BaseModel,
15
+ ConfigDict,
14
16
  Field,
15
- NonNegativeInt,
16
- StrictStr,
17
+ StringConstraints,
18
+ computed_field,
19
+ field_validator,
17
20
  model_validator,
18
21
  )
19
22
  from tabulate import tabulate
23
+ from typing_extensions import Annotated, Self
20
24
 
21
- from docling_core.search.mapping import es_field
22
- from docling_core.types.base import (
23
- Acquisition,
24
- CollectionDocumentInfo,
25
- CollectionNameTypeT,
26
- DescriptionAdvancedT,
27
- DescriptionAnalyticsT,
28
- FileInfoObject,
29
- Identifier,
30
- IdentifierTypeT,
31
- LanguageT,
32
- Log,
33
- )
34
- from docling_core.types.doc.base import (
35
- BaseCell,
36
- BaseText,
37
- BitmapObject,
38
- Figure,
39
- PageDimensions,
40
- PageReference,
41
- Ref,
42
- S3Data,
43
- Table,
44
- )
45
- from docling_core.types.doc.tokens import DocumentToken
46
- from docling_core.utils.alias import AliasModel
25
+ from docling_core.search.package import VERSION_PATTERN
26
+ from docling_core.types.base import _JSON_POINTER_REGEX
27
+ from docling_core.types.doc import BoundingBox, Size
28
+ from docling_core.types.doc.labels import DocItemLabel, GroupLabel
29
+ from docling_core.types.legacy_doc.tokens import DocumentToken
47
30
 
31
+ Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
32
+ LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
33
+ CURRENT_VERSION: Final = "1.0.0"
48
34
 
49
- class CCSFileInfoDescription(BaseModel, extra="forbid"):
50
- """File info description."""
35
+ DEFAULT_EXPORT_LABELS = {
36
+ DocItemLabel.TITLE,
37
+ DocItemLabel.DOCUMENT_INDEX,
38
+ DocItemLabel.SECTION_HEADER,
39
+ DocItemLabel.PARAGRAPH,
40
+ DocItemLabel.CAPTION,
41
+ DocItemLabel.TABLE,
42
+ DocItemLabel.PICTURE,
43
+ DocItemLabel.FORMULA,
44
+ DocItemLabel.CHECKBOX_UNSELECTED,
45
+ DocItemLabel.CHECKBOX_SELECTED,
46
+ DocItemLabel.TEXT,
47
+ DocItemLabel.LIST_ITEM,
48
+ DocItemLabel.CODE,
49
+ }
51
50
 
52
- author: Optional[list[StrictStr]] = None
53
- keywords: Optional[str] = None
54
- subject: Optional[str] = None
55
- title: Optional[StrictStr] = None
56
- creation_date: Optional[str] = None # datetime
57
51
 
52
+ class BasePictureData(BaseModel):
53
+ """BasePictureData."""
58
54
 
59
- class CCSFileInfoObject(FileInfoObject, extra="forbid"):
60
- """File info object."""
55
+ kind: str
61
56
 
62
- num_pages: Optional[int] = Field(default=None, alias="#-pages")
63
57
 
64
- collection_name: Optional[str] = Field(
65
- default=None,
66
- alias="collection-name",
67
- json_schema_extra=es_field(type="keyword", ignore_above=8191),
68
- )
69
- description: Optional[CCSFileInfoDescription] = Field(
70
- default=None, json_schema_extra=es_field(suppress=True)
71
- )
72
- page_hashes: Optional[list[PageReference]] = Field(
73
- default=None, alias="page-hashes"
74
- )
58
+ class PictureClassificationClass(BaseModel):
59
+ """PictureClassificationData."""
75
60
 
61
+ class_name: str
62
+ confidence: float
76
63
 
77
- class Affiliation(BaseModel, extra="forbid"):
78
- """Affiliation."""
79
-
80
- name: str = Field(
81
- ...,
82
- json_schema_extra=es_field(
83
- fields={
84
- "lower": {
85
- "normalizer": "lowercase_asciifolding",
86
- "type": "keyword",
87
- "ignore_above": 8191,
88
- },
89
- "keyword": {"type": "keyword", "ignore_above": 8191},
90
- },
91
- ),
92
- )
93
- id: Optional[str] = Field(
94
- default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
95
- )
96
- source: Optional[str] = Field(
97
- default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
98
- )
99
64
 
65
+ class PictureClassificationData(BasePictureData):
66
+ """PictureClassificationData."""
100
67
 
101
- class Author(BaseModel, extra="forbid"):
102
- """Author."""
103
-
104
- name: str = Field(
105
- ...,
106
- json_schema_extra=es_field(
107
- type="text",
108
- fields={
109
- "lower": {
110
- "normalizer": "lowercase_asciifolding",
111
- "type": "keyword",
112
- "ignore_above": 8191,
113
- },
114
- "keyword": {"type": "keyword", "ignore_above": 8191},
115
- },
116
- ),
117
- )
118
- id: Optional[str] = Field(
119
- default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
120
- )
121
- source: Optional[str] = Field(
122
- default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
123
- )
124
- affiliations: Optional[list[Affiliation]] = None
68
+ kind: Literal["classification"] = "classification"
69
+ provenance: str
70
+ predicted_classes: List[PictureClassificationClass]
125
71
 
126
72
 
127
- class Publication(BaseModel, Generic[IdentifierTypeT], extra="forbid"):
128
- """Publication details of a journal or venue."""
73
+ class PictureDescriptionData(BasePictureData):
74
+ """PictureDescriptionData."""
129
75
 
130
- identifiers: Optional[list[Identifier[IdentifierTypeT]]] = Field(
131
- default=None,
132
- description="Unique identifiers of a publication venue.",
133
- )
134
- name: StrictStr = Field(
135
- json_schema_extra=es_field(type="keyword", ignore_above=8191),
136
- description="Name of the publication.",
137
- )
138
- alternate_names: Optional[list[StrictStr]] = Field(
139
- default=None,
140
- json_schema_extra=es_field(type="text"),
141
- title="Alternate Names",
142
- description="Other names or abbreviations of this publication.",
143
- )
144
- type: Optional[list[StrictStr]] = Field(
145
- default=None,
146
- json_schema_extra=es_field(type="keyword", ignore_above=8191),
147
- description="Type of publication (journal article, conference, review,...).",
148
- )
149
- pages: Optional[StrictStr] = Field(
150
- default=None,
151
- json_schema_extra=es_field(type="text"),
152
- description="Page range in the publication.",
153
- )
154
- issue: Optional[StrictStr] = Field(
155
- default=None,
156
- json_schema_extra=es_field(type="keyword", ignore_above=8191),
157
- description="Publication issue (issue number).",
158
- )
159
- volume: Optional[StrictStr] = Field(
160
- default=None,
161
- json_schema_extra=es_field(type="keyword", ignore_above=8191),
162
- description="Publication volume.",
163
- )
164
- url: Optional[AnyHttpUrl] = Field(
165
- default=None,
166
- json_schema_extra=es_field(type="keyword", ignore_above=8191),
167
- description="URL on the publication site.",
168
- )
76
+ kind: Literal["description"] = "description"
77
+ text: str
78
+ provenance: str
169
79
 
170
80
 
171
- class DescriptionLicense(BaseModel, extra="forbid"):
172
- """Licence in document description."""
81
+ class PictureMoleculeData(BaseModel):
82
+ """PictureMoleculeData."""
173
83
 
174
- code: Optional[StrictStr] = Field(
175
- default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
176
- )
177
- text: Optional[StrictStr] = None
84
+ kind: Literal["molecule_data"] = "molecule_data"
178
85
 
86
+ smi: str
87
+ confidence: float
88
+ class_name: str
89
+ segmentation: List[Tuple[float, float]]
90
+ provenance: str
179
91
 
180
- class CCSDocumentDescription(
181
- AliasModel,
182
- Generic[
183
- DescriptionAdvancedT,
184
- DescriptionAnalyticsT,
185
- IdentifierTypeT,
186
- LanguageT,
187
- CollectionNameTypeT,
188
- ],
189
- ):
190
- """Description in document."""
191
-
192
- title: Optional[StrictStr] = None
193
- abstract: Optional[list[StrictStr]] = None
194
- authors: Optional[list[Author]] = None
195
- affiliations: Optional[list[Affiliation]] = None
196
- subjects: Optional[list[str]] = Field(
197
- default=None,
198
- json_schema_extra=es_field(
199
- fields={"keyword": {"ignore_above": 8191, "type": "keyword"}}
200
- ),
201
- )
202
- keywords: Optional[list[str]] = Field(
203
- default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
204
- )
205
- publication_date: Optional[datetime] = None
206
- languages: Optional[list[LanguageT]] = Field(
207
- default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
208
- )
209
- license_: Optional[DescriptionLicense] = Field(default=None, alias="license")
210
- publishers: Optional[list[StrictStr]] = Field(
211
- default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
212
- )
213
- url_refs: Optional[list[str]] = Field(
214
- default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
215
- )
216
- references: Optional[list[Identifier[IdentifierTypeT]]] = None
217
- publication: Optional[list[Publication]] = Field(
218
- default=None, description="List of publication journals or venues."
219
- )
220
- reference_count: Optional[NonNegativeInt] = Field(
221
- default=None,
222
- title="Reference Count",
223
- description="Total number of documents referenced by this document.",
224
- json_schema_extra=es_field(type="integer"),
225
- )
226
- citation_count: Optional[NonNegativeInt] = Field(
227
- default=None,
228
- title="Citation Count",
229
- description=(
230
- "Total number of citations that this document has received (number "
231
- "of documents in whose bibliography this document appears)."
232
- ),
233
- json_schema_extra=es_field(type="integer"),
234
- )
235
- citation_date: Optional[datetime] = Field(
236
- default=None,
237
- title="Citation Count Date",
238
- description="Last update date of the citation count.",
239
- )
240
- advanced: Optional[DescriptionAdvancedT] = None
241
- analytics: Optional[DescriptionAnalyticsT] = None
242
- logs: list[Log]
243
- collection: Optional[CollectionDocumentInfo[CollectionNameTypeT]] = Field(
244
- default=None, description="The collection information of this document."
245
- )
246
- acquisition: Optional[Acquisition] = Field(
247
- default=None,
248
- description=(
249
- "Information on how the document was obtained, for data governance"
250
- " purposes."
251
- ),
252
- )
253
92
 
93
+ class PictureMiscData(BaseModel):
94
+ """PictureMiscData."""
254
95
 
255
- class MinimalDocument(
256
- AliasModel,
257
- Generic[
258
- DescriptionAdvancedT,
259
- DescriptionAnalyticsT,
260
- IdentifierTypeT,
261
- LanguageT,
262
- CollectionNameTypeT,
263
- ],
264
- ):
265
- """Minimal model for a document."""
266
-
267
- name: StrictStr = Field(alias="_name")
268
- obj_type: Optional[StrictStr] = Field("document", alias="type")
269
- description: CCSDocumentDescription[
270
- DescriptionAdvancedT,
271
- DescriptionAnalyticsT,
272
- IdentifierTypeT,
273
- LanguageT,
274
- CollectionNameTypeT,
275
- ]
276
- file_info: FileInfoObject = Field(alias="file-info")
277
- main_text: Optional[list[Union[Ref, BaseText]]] = Field(
278
- default=None, alias="main-text"
279
- )
280
- figures: Optional[list[Figure]] = None
281
- tables: Optional[list[Table]] = None
282
-
283
-
284
- class CCSDocument(
285
- MinimalDocument,
286
- Generic[
287
- DescriptionAdvancedT,
288
- DescriptionAnalyticsT,
289
- IdentifierTypeT,
290
- LanguageT,
291
- CollectionNameTypeT,
96
+ kind: Literal["misc"] = "misc"
97
+ content: Dict[str, Any]
98
+
99
+
100
+ PictureDataType = Annotated[
101
+ Union[
102
+ PictureClassificationData,
103
+ PictureDescriptionData,
104
+ PictureMoleculeData,
105
+ PictureMiscData,
292
106
  ],
293
- ):
294
- """Model for a CCS-generated document."""
295
-
296
- obj_type: Optional[StrictStr] = Field("pdf-document", alias="type")
297
- bitmaps: Optional[list[BitmapObject]] = None
298
- equations: Optional[list[BaseCell]] = None
299
- footnotes: Optional[list[BaseText]] = None
300
- file_info: CCSFileInfoObject = Field(alias="file-info")
301
- main_text: Optional[list[Union[Ref, BaseText]]] = Field(
302
- default=None,
303
- alias="main-text",
304
- )
305
- page_dimensions: Optional[list[PageDimensions]] = Field(
306
- default=None, alias="page-dimensions"
307
- )
308
- page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers")
309
- page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers")
310
- s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data")
107
+ Field(discriminator="kind"),
108
+ ]
109
+
110
+
111
+ class TableCell(BaseModel):
112
+ """TableCell."""
113
+
114
+ bbox: Optional[BoundingBox] = None
115
+ row_span: int = 1
116
+ col_span: int = 1
117
+ start_row_offset_idx: int
118
+ end_row_offset_idx: int
119
+ start_col_offset_idx: int
120
+ end_col_offset_idx: int
121
+ text: str
122
+ column_header: bool = False
123
+ row_header: bool = False
124
+ row_section: bool = False
311
125
 
312
126
  @model_validator(mode="before")
313
127
  @classmethod
314
- def from_dict(cls, data):
315
- """Validates and fixes the input data."""
316
- if not isinstance(data, dict):
317
- return data
318
- description_collection = data["description"].get("collection")
319
- if not description_collection:
320
- data["description"].setdefault("collection", {})
321
-
322
- data["description"]["collection"].setdefault("type", "Document")
323
- logs = data["description"].get("logs")
324
- if not logs:
325
- data["description"].setdefault("logs", [])
326
-
327
- abstract = data["description"].get("abstract")
328
- if abstract is not None and not isinstance(abstract, list):
329
- if isinstance(abstract, str):
330
- data["description"]["abstract"] = [abstract]
331
- else:
332
- data["description"].pop("abstract")
128
+ def from_dict_format(cls, data: Any) -> Any:
129
+ """from_dict_format."""
130
+ if isinstance(data, Dict):
131
+ # Check if this is a native BoundingBox or a bbox from docling-ibm-models
132
+ if (
133
+ # "bbox" not in data
134
+ # or data["bbox"] is None
135
+ # or isinstance(data["bbox"], BoundingBox)
136
+ "text"
137
+ in data
138
+ ):
139
+ return data
140
+ text = data["bbox"].get("token", "")
141
+ if not len(text):
142
+ text_cells = data.pop("text_cell_bboxes", None)
143
+ if text_cells:
144
+ for el in text_cells:
145
+ text += el["token"] + " "
146
+
147
+ text = text.strip()
148
+ data["text"] = text
333
149
 
334
- for key in ["affiliations", "authors"]:
335
- descr = data["description"].get(key)
336
- if descr is not None and not isinstance(descr, list):
337
- if isinstance(descr, dict):
338
- data["description"][key] = [descr]
339
- else:
340
- data["description"].pop(key)
150
+ return data
341
151
 
342
- if data.get("main-text"):
343
- for item in data["main-text"]:
344
- if ref := item.pop("__ref", None):
345
- item["$ref"] = ref
346
152
 
347
- return data
153
+ class TableData(BaseModel): # TBD
154
+ """BaseTableData."""
348
155
 
156
+ table_cells: List[TableCell] = []
157
+ num_rows: int = 0
158
+ num_cols: int = 0
349
159
 
350
- class ExportedCCSDocument(
351
- MinimalDocument,
352
- Generic[
353
- DescriptionAdvancedT,
354
- DescriptionAnalyticsT,
355
- IdentifierTypeT,
356
- LanguageT,
357
- CollectionNameTypeT,
358
- ],
359
- ):
360
- """Document model for Docling."""
160
+ @computed_field # type: ignore
161
+ @property
162
+ def grid(
163
+ self,
164
+ ) -> List[List[TableCell]]:
165
+ """grid."""
166
+ # Initialise empty table data grid (only empty cells)
167
+ table_data = [
168
+ [
169
+ TableCell(
170
+ text="",
171
+ start_row_offset_idx=i,
172
+ end_row_offset_idx=i + 1,
173
+ start_col_offset_idx=j,
174
+ end_col_offset_idx=j + 1,
175
+ )
176
+ for j in range(self.num_cols)
177
+ ]
178
+ for i in range(self.num_rows)
179
+ ]
180
+
181
+ # Overwrite cells in table data for which there is actual cell content.
182
+ for cell in self.table_cells:
183
+ for i in range(
184
+ min(cell.start_row_offset_idx, self.num_rows),
185
+ min(cell.end_row_offset_idx, self.num_rows),
186
+ ):
187
+ for j in range(
188
+ min(cell.start_col_offset_idx, self.num_cols),
189
+ min(cell.end_col_offset_idx, self.num_cols),
190
+ ):
191
+ table_data[i][j] = cell
192
+
193
+ return table_data
361
194
 
362
- obj_type: Optional[StrictStr] = Field(
363
- "pdf-document",
364
- alias="type",
365
- json_schema_extra=es_field(type="keyword", ignore_above=8191),
195
+
196
+ class DocumentOrigin(BaseModel):
197
+ """FileSource."""
198
+
199
+ mimetype: str # the mimetype of the original file
200
+ binary_hash: Uint64 # the binary hash of the original file.
201
+ # TODO: Change to be Uint64 and provide utility method to generate
202
+
203
+ filename: str # The name of the original file, including extension, without path.
204
+ # Could stem from filesystem, source URI, Content-Disposition header, ...
205
+
206
+ uri: Optional[AnyUrl] = (
207
+ None # any possible reference to a source file,
208
+ # from any file handler protocol (e.g. https://, file://, s3://)
366
209
  )
367
- bitmaps: Optional[list[BitmapObject]] = None
368
- equations: Optional[list[BaseCell]] = None
369
- footnotes: Optional[list[BaseText]] = None
370
- description: CCSDocumentDescription[
371
- DescriptionAdvancedT,
372
- DescriptionAnalyticsT,
373
- IdentifierTypeT,
374
- LanguageT,
375
- CollectionNameTypeT,
210
+
211
+ _extra_mimetypes: typing.ClassVar[List[str]] = [
212
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
213
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
214
+ "application/vnd.openxmlformats-officedocument.presentationml.template",
215
+ "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
216
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
376
217
  ]
377
- file_info: CCSFileInfoObject = Field(alias="file-info")
378
- main_text: Optional[list[Union[Ref, BaseText]]] = Field(
379
- default=None, alias="main-text"
380
- )
381
- page_dimensions: Optional[list[PageDimensions]] = Field(
382
- default=None, alias="page-dimensions"
218
+
219
+ @field_validator("binary_hash", mode="before")
220
+ @classmethod
221
+ def parse_hex_string(cls, value):
222
+ """parse_hex_string."""
223
+ if isinstance(value, str):
224
+ try:
225
+ # Convert hex string to an integer
226
+ hash_int = Uint64(value, 16)
227
+ # Mask to fit within 64 bits (unsigned)
228
+ return (
229
+ hash_int & 0xFFFFFFFFFFFFFFFF
230
+ ) # TODO be sure it doesn't clip uint64 max
231
+ except ValueError:
232
+ raise ValueError(f"Invalid sha256 hexdigest: {value}")
233
+ return value # If already an int, return it as is.
234
+
235
+ @field_validator("mimetype")
236
+ @classmethod
237
+ def validate_mimetype(cls, v):
238
+ """validate_mimetype."""
239
+ # Check if the provided MIME type is valid using mimetypes module
240
+ if v not in mimetypes.types_map.values() and v not in cls._extra_mimetypes:
241
+ raise ValueError(f"'{v}' is not a valid MIME type")
242
+ return v
243
+
244
+
245
+ class RefItem(BaseModel):
246
+ """RefItem."""
247
+
248
+ cref: str = Field(alias="$ref", pattern=_JSON_POINTER_REGEX)
249
+
250
+ # This method makes RefItem compatible with DocItem
251
+ def get_ref(self):
252
+ """get_ref."""
253
+ return self
254
+
255
+ model_config = ConfigDict(
256
+ populate_by_name=True,
383
257
  )
384
- page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers")
385
- page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers")
386
- s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data")
387
- identifiers: Optional[list[Identifier[IdentifierTypeT]]] = None
388
258
 
389
- @model_validator(mode="before")
259
+ def resolve(self, doc: "DoclingDocument"):
260
+ """resolve."""
261
+ path_components = self.cref.split("/")
262
+ if (num_comps := len(path_components)) == 3:
263
+ _, path, index_str = path_components
264
+ index = int(index_str)
265
+ obj = doc.__getattribute__(path)[index]
266
+ elif num_comps == 2:
267
+ _, path = path_components
268
+ obj = doc.__getattribute__(path)
269
+ else:
270
+ raise RuntimeError(f"Unsupported number of path components: {num_comps}")
271
+ return obj
272
+
273
+
274
+ class ImageRef(BaseModel):
275
+ """ImageRef."""
276
+
277
+ mimetype: str
278
+ dpi: int
279
+ size: Size
280
+ uri: AnyUrl
281
+ _pil: Optional[PILImage.Image] = None
282
+
283
+ @property
284
+ def pil_image(self) -> PILImage.Image:
285
+ """Return the PIL Image."""
286
+ if self._pil is not None:
287
+ return self._pil
288
+
289
+ if str(self.uri).startswith("data:"):
290
+ encoded_img = str(self.uri).split(",")[1]
291
+ decoded_img = base64.b64decode(encoded_img)
292
+ self._pil = PILImage.open(BytesIO(decoded_img))
293
+ else:
294
+ self._pil = PILImage.open(str(self.uri))
295
+
296
+ return self._pil
297
+
298
+ @field_validator("mimetype")
390
299
  @classmethod
391
- def from_dict(cls, data):
392
- """Fix ref in main-text."""
393
- if not isinstance(data, dict):
394
- return data
395
- if data.get("main-text"):
396
- for item in data["main-text"]:
397
- if ref := item.pop("__ref", None):
398
- item["$ref"] = ref
300
+ def validate_mimetype(cls, v):
301
+ """validate_mimetype."""
302
+ # Check if the provided MIME type is valid using mimetypes module
303
+ if v not in mimetypes.types_map.values():
304
+ raise ValueError(f"'{v}' is not a valid MIME type")
305
+ return v
399
306
 
400
- return data
307
+ @classmethod
308
+ def from_pil(cls, image: PILImage.Image, dpi: int) -> Self:
309
+ """Construct ImageRef from a PIL Image."""
310
+ buffered = BytesIO()
311
+ image.save(buffered, format="PNG")
312
+ img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
313
+ img_uri = f"data:image/png;base64,{img_str}"
314
+ return cls(
315
+ mimetype="image/png",
316
+ dpi=dpi,
317
+ size=Size(width=image.width, height=image.height),
318
+ uri=img_uri,
319
+ _pil=image,
320
+ )
401
321
 
402
- def _resolve_ref(self, item: Ref) -> Optional[Union[BaseCell, BaseText]]:
403
- """Return the resolved reference.
404
322
 
405
- Resolved the Ref object within the document.
406
- If the object is not found, None is returned.
323
+ class ProvenanceItem(BaseModel):
324
+ """ProvenanceItem."""
325
+
326
+ page_no: int
327
+ bbox: BoundingBox
328
+ charspan: Tuple[int, int]
329
+
330
+
331
+ class NodeItem(BaseModel):
332
+ """NodeItem."""
333
+
334
+ self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
335
+ parent: Optional[RefItem] = None
336
+ children: List[RefItem] = []
337
+
338
+ model_config = ConfigDict(extra="forbid")
339
+
340
+ def get_ref(self):
341
+ """get_ref."""
342
+ return RefItem(cref=self.self_ref)
343
+
344
+
345
+ class GroupItem(NodeItem): # Container type, can't be a leaf node
346
+ """GroupItem."""
347
+
348
+ name: str = (
349
+ "group" # Name of the group, e.g. "Introduction Chapter",
350
+ # "Slide 5", "Navigation menu list", ...
351
+ )
352
+ label: GroupLabel = GroupLabel.UNSPECIFIED
353
+
354
+
355
+ class DocItem(
356
+ NodeItem
357
+ ): # Base type for any element that carries content, can be a leaf node
358
+ """DocItem."""
359
+
360
+ label: DocItemLabel
361
+ prov: List[ProvenanceItem] = []
362
+
363
+ def get_location_tokens(
364
+ self,
365
+ doc: "DoclingDocument",
366
+ new_line: str,
367
+ xsize: int = 100,
368
+ ysize: int = 100,
369
+ add_page_index: bool = True,
370
+ ) -> str:
371
+ """Get the location string for the BaseCell."""
372
+ if not len(self.prov):
373
+ return ""
374
+
375
+ location = ""
376
+ for prov in self.prov:
377
+ page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
378
+
379
+ page_i = -1
380
+ if add_page_index:
381
+ page_i = prov.page_no
382
+
383
+ loc_str = DocumentToken.get_location(
384
+ bbox=prov.bbox.to_bottom_left_origin(page_h).as_tuple(),
385
+ page_w=page_w,
386
+ page_h=page_h,
387
+ xsize=xsize,
388
+ ysize=ysize,
389
+ page_i=page_i,
390
+ )
391
+ location += f"{loc_str}{new_line}"
392
+
393
+ return location
394
+
395
+
396
+ class TextItem(DocItem):
397
+ """TextItem."""
398
+
399
+ orig: str # untreated representation
400
+ text: str # sanitized representation
401
+
402
+ def export_to_document_tokens(
403
+ self,
404
+ doc: "DoclingDocument",
405
+ new_line: str = "\n",
406
+ xsize: int = 100,
407
+ ysize: int = 100,
408
+ add_location: bool = True,
409
+ add_content: bool = True,
410
+ add_page_index: bool = True,
411
+ ):
412
+ r"""Export text element to document tokens format.
413
+
414
+ :param doc: "DoclingDocument":
415
+ :param new_line: str: (Default value = "\n")
416
+ :param xsize: int: (Default value = 100)
417
+ :param ysize: int: (Default value = 100)
418
+ :param add_location: bool: (Default value = True)
419
+ :param add_content: bool: (Default value = True)
420
+ :param add_page_index: bool: (Default value = True)
421
+
407
422
  """
408
- result: Optional[Union[BaseCell, BaseText]] = None
409
-
410
- # NOTE: currently only resolves refs explicitely, such that we can make
411
- # assumptions on ref parts
412
- if item.obj_type == "table" and self.tables:
413
- parts = item.ref.split("/")
414
- result = self.tables[int(parts[2])]
415
- elif item.obj_type == "figure" and self.figures:
416
- parts = item.ref.split("/")
417
- result = self.figures[int(parts[2])]
418
- elif item.obj_type == "equation" and self.equations:
419
- parts = item.ref.split("/")
420
- result = self.equations[int(parts[2])]
421
- elif item.obj_type == "footnote" and self.footnotes:
422
- parts = item.ref.split("/")
423
- result = self.footnotes[int(parts[2])]
423
+ body = f"<{self.label.value}>"
424
424
 
425
- return result
425
+ # TODO: This must be done through an explicit mapping.
426
+ # assert DocumentToken.is_known_token(
427
+ # body
428
+ # ), f"failed DocumentToken.is_known_token({body})"
429
+
430
+ if add_location:
431
+ body += self.get_location_tokens(
432
+ doc=doc,
433
+ new_line="",
434
+ xsize=xsize,
435
+ ysize=ysize,
436
+ add_page_index=add_page_index,
437
+ )
438
+
439
+ if add_content and self.text is not None:
440
+ body += self.text.strip()
441
+
442
+ body += f"</{self.label.value}>{new_line}"
443
+
444
+ return body
445
+
446
+
447
+ class SectionHeaderItem(TextItem):
448
+ """SectionItem."""
449
+
450
+ label: typing.Literal[DocItemLabel.SECTION_HEADER] = DocItemLabel.SECTION_HEADER
451
+ level: LevelNumber
452
+
453
+
454
+ class ListItem(TextItem):
455
+ """SectionItem."""
456
+
457
+ label: typing.Literal[DocItemLabel.LIST_ITEM] = DocItemLabel.LIST_ITEM
458
+ enumerated: bool = False
459
+ marker: str # The bullet or number symbol that prefixes this list item
460
+
461
+
462
+ class FloatingItem(DocItem):
463
+ """FloatingItem."""
464
+
465
+ captions: List[RefItem] = []
466
+ references: List[RefItem] = []
467
+ footnotes: List[RefItem] = []
468
+ image: Optional[ImageRef] = None
469
+
470
+ def caption_text(self, doc: "DoclingDocument") -> str:
471
+ """Computes the caption as a single text."""
472
+ text = ""
473
+ for cap in self.captions:
474
+ text += cap.resolve(doc).text
475
+ return text
476
+
477
+
478
+ class PictureItem(FloatingItem):
479
+ """PictureItem."""
480
+
481
+ label: typing.Literal[DocItemLabel.PICTURE] = DocItemLabel.PICTURE
482
+
483
+ annotations: List[PictureDataType] = []
484
+
485
+ def export_to_document_tokens(
486
+ self,
487
+ doc: "DoclingDocument",
488
+ new_line: str = "\n",
489
+ xsize: int = 100,
490
+ ysize: int = 100,
491
+ add_location: bool = True,
492
+ add_caption: bool = True,
493
+ add_content: bool = True, # not used at the moment
494
+ add_page_index: bool = True,
495
+ ):
496
+ r"""Export picture to document tokens format.
497
+
498
+ :param doc: "DoclingDocument":
499
+ :param new_line: str: (Default value = "\n")
500
+ :param xsize: int: (Default value = 100)
501
+ :param ysize: int: (Default value = 100)
502
+ :param add_location: bool: (Default value = True)
503
+ :param add_caption: bool: (Default value = True)
504
+ :param add_content: bool: (Default value = True)
505
+ :param # not used at the momentadd_page_index: bool: (Default value = True)
506
+
507
+ """
508
+ body = f"{DocumentToken.BEG_FIGURE.value}{new_line}"
509
+
510
+ if add_location:
511
+ body += self.get_location_tokens(
512
+ doc=doc,
513
+ new_line=new_line,
514
+ xsize=xsize,
515
+ ysize=ysize,
516
+ add_page_index=add_page_index,
517
+ )
518
+
519
+ if add_caption and len(self.captions):
520
+ text = self.caption_text(doc)
521
+
522
+ if len(text):
523
+ body += f"{DocumentToken.BEG_CAPTION.value}"
524
+ body += f"{text.strip()}"
525
+ body += f"{DocumentToken.END_CAPTION.value}"
526
+ body += f"{new_line}"
527
+
528
+ body += f"{DocumentToken.END_FIGURE.value}{new_line}"
529
+
530
+ return body
426
531
 
427
- def get_map_to_page_dimensions(self):
428
- """Get a map from page-index (start at 1) to page-dim [width, height]."""
429
- pagedims = {}
430
532
 
431
- if self.page_dimensions is not None:
432
- for _ in self.page_dimensions:
433
- pagedims[_.page] = [_.width, _.height]
533
+ class TableItem(FloatingItem):
534
+ """TableItem."""
434
535
 
435
- return pagedims
536
+ data: TableData
537
+ label: typing.Literal[DocItemLabel.TABLE] = DocItemLabel.TABLE
538
+
539
def export_to_dataframe(self) -> pd.DataFrame:
    """Build a pandas DataFrame from the table grid.

    Leading grid rows that contain at least one column-header cell are used as
    column names; when there are several such rows, their cell texts are
    joined per column with ".". Every remaining row becomes a data row made
    of the cell texts.

    :returns: An empty DataFrame when the table reports zero rows or zero
        columns, otherwise the DataFrame described above.
    :rtype: pd.DataFrame
    :raises RuntimeError: If an empty grid row is met while scanning the
        leading header rows.
    """
    if self.data.num_rows == 0 or self.data.num_cols == 0:
        return pd.DataFrame()

    # Scan from the top and stop at the first row without any header cell.
    num_headers = 0
    for row in self.data.grid:
        if not len(row):
            raise RuntimeError(
                f"Invalid table. {len(row)=} but {self.data.num_cols=}."
            )
        if not any(cell.column_header for cell in row):
            break
        num_headers += 1

    # Stack the header rows into one dotted name per column.
    columns: Optional[List[str]] = None
    if num_headers > 0:
        columns = [""] * self.data.num_cols
        for header_row in self.data.grid[:num_headers]:
            for col_idx, cell in enumerate(header_row):
                if columns[col_idx] != "":
                    columns[col_idx] += f".{cell.text}"
                else:
                    columns[col_idx] += cell.text

    # Everything below the header rows is table content.
    body_rows = []
    for row in self.data.grid[num_headers:]:
        body_rows.append([cell.text for cell in row])

    return pd.DataFrame(body_rows, columns=columns)
583
+
584
def export_to_markdown(self) -> str:
    """Render the table as a GitHub-flavoured markdown table.

    The first grid row is used as the header row. An empty string is
    returned when the grid has fewer than two rows or the first row is empty.

    :returns: The markdown table, or "" when nothing can be rendered.
    :rtype: str
    """
    rows = [[cell.text for cell in grid_row] for grid_row in self.data.grid]

    if len(rows) <= 1 or len(rows[0]) == 0:
        return ""

    header, content = rows[0], rows[1:]
    try:
        return tabulate(content, headers=header, tablefmt="github")
    except ValueError:
        # Retry with tabulate's number parsing switched off.
        return tabulate(
            content,
            headers=header,
            tablefmt="github",
            disable_numparse=True,
        )
605
+
606
def export_to_html(self) -> str:
    """Export the table as an HTML ``<table>`` string.

    One ``<tr>`` is emitted per grid row. A cell is written only at the grid
    position that matches its own start row/column offsets, with ``rowspan``
    / ``colspan`` attributes when the span is larger than one. Column-header
    cells use ``<th>``, all others ``<td>``.

    Cell text is HTML-escaped so that characters such as ``<``, ``>``, ``&``
    and quotes coming from the source document cannot break or inject markup.

    :returns: The HTML table, or "" when the table has no cells.
    :rtype: str
    """
    import html  # local import keeps this method self-contained

    nrows = self.data.num_rows
    ncols = self.data.num_cols

    if not len(self.data.table_cells):
        return ""

    grid = self.data.grid
    row_chunks = []
    for i in range(nrows):
        cell_chunks = []
        for j in range(ncols):
            cell = grid[i][j]

            # Skip grid positions whose cell starts elsewhere (presumably
            # positions covered by a row/column span of that cell).
            if cell.start_row_offset_idx != i or cell.start_col_offset_idx != j:
                continue

            celltag = "th" if cell.column_header else "td"

            opening_tag = celltag
            if cell.row_span > 1:
                opening_tag += f' rowspan="{cell.row_span}"'
            if cell.col_span > 1:
                opening_tag += f' colspan="{cell.col_span}"'

            content = html.escape(cell.text.strip())
            cell_chunks.append(f"<{opening_tag}>{content}</{celltag}>")

        row_chunks.append("<tr>" + "".join(cell_chunks) + "</tr>")

    return "<table>" + "".join(row_chunks) + "</table>"
649
+
650
def export_to_document_tokens(
    self,
    doc: "DoclingDocument",
    new_line: str = "\n",
    xsize: int = 100,
    ysize: int = 100,
    add_location: bool = True,
    add_caption: bool = True,
    add_content: bool = True,
    add_cell_location: bool = True,
    add_cell_label: bool = True,
    add_cell_text: bool = True,
    add_page_index: bool = True,
):
    r"""Serialize the table into the document-tokens format.

    The output is wrapped in the BEG_TABLE / END_TABLE tokens and may contain
    the table's location tokens, its caption, and one ``<row_i>`` element per
    grid row holding ``<col_j>`` elements.

    :param doc: "DoclingDocument": Document used to look up captions and pages.
    :param new_line: str: Line separator. (Default value = "\n")
    :param xsize: int: Passed on to the location helpers. (Default value = 100)
    :param ysize: int: Passed on to the location helpers. (Default value = 100)
    :param add_location: bool: Emit the table location. (Default value = True)
    :param add_caption: bool: Emit the caption, if any. (Default value = True)
    :param add_content: bool: Emit rows and cells. (Default value = True)
    :param add_cell_location: bool: Emit per-cell location. (Default value = True)
    :param add_cell_label: bool: Emit per-cell label tag. (Default value = True)
    :param add_cell_text: bool: Emit per-cell text. (Default value = True)
    :param add_page_index: bool: Use the real page number in locations;
        when False, cell locations are built with page_i=-1. (Default value = True)
    :returns: The serialized table.
    """
    chunks = [f"{DocumentToken.BEG_TABLE.value}{new_line}"]

    if add_location:
        chunks.append(
            self.get_location_tokens(
                doc=doc,
                new_line=new_line,
                xsize=xsize,
                ysize=ysize,
                add_page_index=add_page_index,
            )
        )

    if add_caption and len(self.captions):
        caption = self.caption_text(doc)
        if len(caption):
            chunks.append(
                f"{DocumentToken.BEG_CAPTION.value}"
                f"{caption.strip()}"
                f"{DocumentToken.END_CAPTION.value}"
                f"{new_line}"
            )

    if add_content and len(self.data.table_cells) > 0:
        for i, row in enumerate(self.data.grid):
            chunks.append(f"<row_{i}>")
            for j, col in enumerate(row):
                text = col.text.strip() if add_cell_text else ""

                # A cell location needs a bbox and a provenance entry; the
                # page index flag only decides which page_i is reported.
                cell_loc = ""
                if (
                    col.bbox is not None
                    and add_cell_location
                    and len(self.prov) > 0
                ):
                    page_no = self.prov[0].page_no
                    page_w, page_h = doc.pages[page_no].size.as_tuple()
                    cell_loc = DocumentToken.get_location(
                        bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
                        page_w=page_w,
                        page_h=page_h,
                        xsize=xsize,
                        ysize=ysize,
                        page_i=page_no if add_page_index else -1,
                    )

                if not add_cell_label:
                    cell_label = ""
                elif col.column_header:
                    cell_label = "<col_header>"
                elif col.row_header:
                    cell_label = "<row_header>"
                elif col.row_section:
                    cell_label = "<row_section>"
                else:
                    cell_label = "<body>"

                chunks.append(f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>")

            chunks.append(f"</row_{i}>{new_line}")

    chunks.append(f"{DocumentToken.END_TABLE.value}{new_line}")

    return "".join(chunks)
759
+
760
+
761
class KeyValueItem(DocItem):
    """KeyValueItem: a DocItem subclass that adds no fields or methods of its own."""
763
+
764
+
765
# Type alias: any one of the item classes listed in the Union below.
ContentItem = Union[
    TextItem, SectionHeaderItem, ListItem, PictureItem, TableItem, KeyValueItem
]
768
+
769
+
770
class PageItem(BaseModel):
    """PageItem: size, optional image reference and number of a single page."""

    # A page carries separate root items for furniture and body,
    # only referencing items on the page
    size: Size  # page size
    image: Optional[ImageRef] = None  # optional image reference for the page
    page_no: int  # page number; DoclingDocument.add_page uses it as the key in `pages`
778
+
779
+
780
class DoclingDocument(BaseModel):
    """DoclingDocument: root model holding the item trees, item lists and pages.

    ``furniture`` and ``body`` are root groups whose children are references
    (for example "#/texts/0") into the flat lists ``groups``, ``texts``,
    ``pictures`` and ``tables``; the ``add_*`` methods keep both sides in sync.
    """

    schema_name: typing.Literal["DoclingDocument"] = "DoclingDocument"
    # Must match VERSION_PATTERN; defaults to the current schema version.
    version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
        CURRENT_VERSION
    )
    name: str  # The working name of this document, without extensions
    # (could be taken from originating doc, or just "Untitled 1")
    origin: Optional[DocumentOrigin] = (
        None  # DoclingDocuments may specify an origin (converted to DoclingDocument).
        # This is optional, e.g. a DoclingDocument could also be entirely
        # generated from synthetic data.
    )

    # Root groups of the two item trees.
    furniture: GroupItem = GroupItem(
        name="_root_", self_ref="#/furniture"
    )  # List[RefItem] = []
    body: GroupItem = GroupItem(name="_root_", self_ref="#/body")  # List[RefItem] = []

    # Flat storage; items are addressed by index, e.g. "#/groups/<index>".
    groups: List[GroupItem] = []
    texts: List[Union[SectionHeaderItem, ListItem, TextItem]] = []
    pictures: List[PictureItem] = []
    tables: List[TableItem] = []
    key_value_items: List[KeyValueItem] = []

    pages: Dict[int, PageItem] = {}  # empty as default
807
+
808
def add_group(
    self,
    label: Optional[GroupLabel] = None,
    name: Optional[str] = None,
    parent: Optional[GroupItem] = None,
) -> GroupItem:
    """Create a GroupItem, store it in ``self.groups`` and link it under ``parent``.

    :param label: Optional[GroupLabel]: Label set on the group when given. (Default value = None)
    :param name: Optional[str]: Name set on the group when given. (Default value = None)
    :param parent: Optional[GroupItem]: Parent node; ``self.body`` when not given. (Default value = None)
    :returns: The newly created group.
    """
    owner = parent if parent else self.body

    # The reference encodes the position the group will take in self.groups.
    ref_path = f"#/groups/{len(self.groups)}"
    new_group = GroupItem(self_ref=ref_path, parent=owner.get_ref())

    if name is not None:
        new_group.name = name
    if label is not None:
        new_group.label = label

    self.groups.append(new_group)
    owner.children.append(RefItem(cref=ref_path))
    return new_group
837
+
838
def add_list_item(
    self,
    text: str,
    enumerated: bool = False,
    marker: Optional[str] = None,
    orig: Optional[str] = None,
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[GroupItem] = None,
):
    """Create a ListItem, store it in ``self.texts`` and link it under ``parent``.

    :param text: str: Text of the list item.
    :param enumerated: bool: Forwarded to the ListItem. (Default value = False)
    :param marker: Optional[str]: Item marker; "-" is used when empty or None. (Default value = None)
    :param orig: Optional[str]: Original text; ``text`` is used when empty or None. (Default value = None)
    :param prov: Optional[ProvenanceItem]: Appended to the item's provenance when given. (Default value = None)
    :param parent: Optional[GroupItem]: Parent node; ``self.body`` when not given. (Default value = None)
    :returns: The newly created list item.
    """
    owner = parent if parent else self.body
    original_text = orig if orig else text
    bullet = marker or "-"

    # The reference encodes the position the item will take in self.texts.
    ref_path = f"#/texts/{len(self.texts)}"
    entry = ListItem(
        text=text,
        orig=original_text,
        self_ref=ref_path,
        parent=owner.get_ref(),
        enumerated=enumerated,
        marker=bullet,
    )
    if prov:
        entry.prov.append(prov)

    self.texts.append(entry)
    owner.children.append(RefItem(cref=ref_path))
    return entry
881
+
882
def add_text(
    self,
    label: DocItemLabel,
    text: str,
    orig: Optional[str] = None,
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[GroupItem] = None,
):
    """Create a TextItem, store it in ``self.texts`` and link it under ``parent``.

    :param label: DocItemLabel: Label given to the text item.
    :param text: str: Text of the item.
    :param orig: Optional[str]: Original text; ``text`` is used when empty or None. (Default value = None)
    :param prov: Optional[ProvenanceItem]: Appended to the item's provenance when given. (Default value = None)
    :param parent: Optional[GroupItem]: Parent node; ``self.body`` when not given. (Default value = None)
    :returns: The newly created text item.
    """
    owner = parent if parent else self.body
    original_text = orig if orig else text

    # The reference encodes the position the item will take in self.texts.
    ref_path = f"#/texts/{len(self.texts)}"
    entry = TextItem(
        label=label,
        text=text,
        orig=original_text,
        self_ref=ref_path,
        parent=owner.get_ref(),
    )
    if prov:
        entry.prov.append(prov)

    self.texts.append(entry)
    owner.children.append(RefItem(cref=ref_path))
    return entry
921
+
922
def add_table(
    self,
    data: TableData,
    caption: Optional[Union[TextItem, RefItem]] = None,  # This is not cool yet.
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[GroupItem] = None,
):
    """Create a TableItem, store it in ``self.tables`` and link it under ``parent``.

    :param data: TableData: Table content.
    :param caption: Optional[Union[TextItem, RefItem]]: When given, its reference
        is appended to the table's captions. (Default value = None)
    :param prov: Optional[ProvenanceItem]: Appended to the table's provenance when given. (Default value = None)
    :param parent: Optional[GroupItem]: Parent node; ``self.body`` when not given. (Default value = None)
    :returns: The newly created table item.
    """
    owner = parent if parent else self.body

    # The reference encodes the position the table will take in self.tables.
    ref_path = f"#/tables/{len(self.tables)}"
    table = TableItem(
        label=DocItemLabel.TABLE,
        data=data,
        self_ref=ref_path,
        parent=owner.get_ref(),
    )
    if prov:
        table.prov.append(prov)
    if caption:
        table.captions.append(caption.get_ref())

    self.tables.append(table)
    owner.children.append(RefItem(cref=ref_path))
    return table
956
+
957
def add_picture(
    self,
    annotations: Optional[List[PictureDataType]] = None,
    image: Optional[ImageRef] = None,
    caption: Optional[Union[TextItem, RefItem]] = None,
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[GroupItem] = None,
):
    """Create a PictureItem, store it in ``self.pictures`` and link it under ``parent``.

    :param annotations: Optional[List[PictureDataType]]: Picture annotations;
        an empty list is used when not given. (Default value = None)
    :param image: Optional[ImageRef]: Image reference stored on the picture. (Default value = None)
    :param caption: Optional[Union[TextItem, RefItem]]: When given, its reference
        is appended to the picture's captions. (Default value = None)
    :param prov: Optional[ProvenanceItem]: Appended to the picture's provenance when given. (Default value = None)
    :param parent: Optional[GroupItem]: Parent node; ``self.body`` when not given. (Default value = None)
    :returns: The newly created picture item.
    """
    if not parent:
        parent = self.body

    # Use a fresh list per call instead of a shared mutable default argument.
    if annotations is None:
        annotations = []

    # The reference encodes the position the picture will take in self.pictures.
    picture_index = len(self.pictures)
    cref = f"#/pictures/{picture_index}"

    fig_item = PictureItem(
        label=DocItemLabel.PICTURE,
        annotations=annotations,
        image=image,
        self_ref=cref,
        parent=parent.get_ref(),
    )
    if prov:
        fig_item.prov.append(prov)
    if caption:
        fig_item.captions.append(caption.get_ref())

    self.pictures.append(fig_item)
    parent.children.append(RefItem(cref=cref))

    return fig_item
996
+
997
def add_heading(
    self,
    text: str,
    orig: Optional[str] = None,
    level: LevelNumber = 1,
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[GroupItem] = None,
):
    """Create a SectionHeaderItem, store it in ``self.texts`` and link it under ``parent``.

    :param text: str: Heading text.
    :param orig: Optional[str]: Original text; ``text`` is used when empty or None. (Default value = None)
    :param level: LevelNumber: Heading level forwarded to the item. (Default value = 1)
    :param prov: Optional[ProvenanceItem]: Appended to the item's provenance when given. (Default value = None)
    :param parent: Optional[GroupItem]: Parent node; ``self.body`` when not given. (Default value = None)
    :returns: The newly created section header item.
    """
    owner = parent if parent else self.body
    original_text = orig if orig else text

    # The reference encodes the position the heading will take in self.texts.
    ref_path = f"#/texts/{len(self.texts)}"
    heading = SectionHeaderItem(
        level=level,
        text=text,
        orig=original_text,
        self_ref=ref_path,
        parent=owner.get_ref(),
    )
    if prov:
        heading.prov.append(prov)

    self.texts.append(heading)
    owner.children.append(RefItem(cref=ref_path))
    return heading
1037
+
1038
def num_pages(self):
    """Return the number of entries in ``self.pages``."""
    page_count = len(self.pages)
    return page_count
1041
+
1042
def validate_tree(self, root) -> bool:
    """Check that every node below ``root`` points back to its actual parent.

    Each child reference of ``root`` is resolved; the check fails immediately
    when a child's ``parent`` reference does not resolve to ``root``. The
    subtrees of all children are validated recursively.

    :param root: Node whose ``children`` references are checked.
    :returns: True when the whole subtree is consistent (also for a node
        without children), False otherwise.
    """
    consistent = True
    for ref in root.children:
        node = ref.resolve(self)
        if node.parent.resolve(self) != root:
            return False
        # Keep visiting the remaining children even after a failed subtree.
        if not self.validate_tree(node):
            consistent = False

    return consistent
1052
+
1053
def iterate_items(
    self,
    root: Optional[NodeItem] = None,
    with_groups: bool = False,
    traverse_pictures: bool = True,
    page_no: Optional[int] = None,
    _level: int = 0,  # fixed parameter, carries through the node nesting level
) -> typing.Iterable[Tuple[NodeItem, int]]:  # tuple of node and level
    """Walk the item tree depth-first and yield ``(node, level)`` tuples.

    :param root: Optional[NodeItem]: Node to start from; ``self.body`` when
        not given. (Default value = None)
    :param with_groups: bool: Also yield GroupItem nodes. (Default value = False)
    :param traverse_pictures: bool: When False, PictureItem children are not
        descended into at any depth. (Default value = True)
    :param page_no: Optional[int]: When set, a DocItem is yielded only if one
        of its provenance entries is on that page; the filter applies at
        every depth. (Default value = None)
    :param _level: int: Nesting level of ``root``; internal, used by the
        recursion. (Default value = 0)
    :returns: An iterable of ``(node, nesting level)`` tuples.
    """
    if not root:
        root = self.body

    if not isinstance(root, GroupItem) or with_groups:
        if isinstance(root, DocItem) and page_no is not None:
            # Yield the item once, even when several provenance entries
            # fall on the requested page.
            if any(prov.page_no == page_no for prov in root.prov):
                yield root, _level
        else:
            yield root, _level

    # Traverse children
    for child_ref in root.children:
        child = child_ref.resolve(self)

        if isinstance(child, NodeItem):
            # If the child is a NodeItem, recursively traverse it
            if not isinstance(child, PictureItem) or traverse_pictures:
                # Forward every filter so it holds below the first level too.
                yield from self.iterate_items(
                    child,
                    with_groups=with_groups,
                    traverse_pictures=traverse_pictures,
                    page_no=page_no,
                    _level=_level + 1,
                )
1095
+
1096
def print_element_tree(self):
    """Print one line per node of the body tree, groups included.

    Each line is indented by the node's nesting level and shows the running
    index and the label value; groups also show their name.
    """
    for ix, (item, level) in enumerate(self.iterate_items(with_groups=True)):
        if isinstance(item, GroupItem):
            description = f"{ix}: {item.label.value} with name={item.name}"
        elif isinstance(item, DocItem):
            description = f"{ix}: {item.label.value}"
        else:
            # Nodes that are neither groups nor doc items are not printed.
            continue
        print(" " * level, description)
1103
+
1104
def export_to_dict(self) -> Dict:
    """Dump the model to a dict in JSON mode, using aliases and dropping None values."""
    dump_options = {"mode": "json", "by_alias": True, "exclude_none": True}
    return self.model_dump(**dump_options)
436
1107
 
437
1108
  def export_to_markdown( # noqa: C901
438
1109
  self,
439
1110
  delim: str = "\n\n",
440
- main_text_start: int = 0,
441
- main_text_stop: Optional[int] = None,
442
- main_text_labels: list[str] = [
443
- "title",
444
- "subtitle-level-1",
445
- "paragraph",
446
- "caption",
447
- "table",
448
- "figure",
449
- ],
1111
+ from_element: int = 0,
1112
+ to_element: Optional[int] = None,
1113
+ labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
450
1114
  strict_text: bool = False,
451
1115
  image_placeholder: str = "<!-- image -->",
452
1116
  ) -> str:
@@ -455,66 +1119,77 @@ class ExportedCCSDocument(
455
1119
  Operates on a slice of the document's main_text as defined through arguments
456
1120
  main_text_start and main_text_stop; defaulting to the whole main_text.
457
1121
 
458
- Args:
459
- delim (str, optional): Delimiter to use when concatenating the various
1122
+ :param delim: Delimiter to use when concatenating the various
460
1123
  Markdown parts. Defaults to "\n\n".
461
- main_text_start (int, optional): Main-text slicing start index (inclusive).
1124
+ :type delim: str
1125
+ :param from_element: Body slicing start index (inclusive).
462
1126
  Defaults to 0.
463
- main_text_end (Optional[int], optional): Main-text slicing stop index
1127
+ :type from_element: int
1128
+ :param to_element: Body slicing stop index
464
1129
  (exclusive). Defaults to None.
465
- main_text_labels (list[str], optional): The labels to include in the
466
- markdown.
467
- strict_text (bool, optional): if true, the output will be only plain text
468
- without any markdown styling. Defaults to False.
469
- image_placeholder (str, optional): the placeholder to include to position
470
- images in the markdown. Defaults to a markdown comment "<!-- image -->".
471
-
472
- Returns:
473
- str: The exported Markdown representation.
1130
+ :type to_element: Optional[int]
1131
+ :param delim: str: (Default value = "\n\n")
1132
+ :param from_element: int: (Default value = 0)
1133
+ :param to_element: Optional[int]: (Default value = None)
1134
+ :param labels: set[DocItemLabel]
1135
+ :param "subtitle-level-1":
1136
+ :param "paragraph":
1137
+ :param "caption":
1138
+ :param "table":
1139
+ :param "Text":
1140
+ :param "text":
1141
+ :param ]:
1142
+ :param strict_text: bool: (Default value = False)
1143
+ :param image_placeholder str: (Default value = "<!-- image -->")
1144
+ the placeholder to include to position images in the markdown.
1145
+ :returns: The exported Markdown representation.
1146
+ :rtype: str
474
1147
  """
475
1148
  has_title = False
476
1149
  prev_text = ""
477
1150
  md_texts: list[str] = []
478
1151
 
479
- if self.main_text is not None:
480
- # collect all captions embedded in table and figure objects
481
- # to avoid repeating them
482
- embedded_captions = set()
483
- for orig_item in self.main_text[main_text_start:main_text_stop]:
484
- item = (
485
- self._resolve_ref(orig_item)
486
- if isinstance(orig_item, Ref)
487
- else orig_item
488
- )
489
- if item is None:
490
- continue
491
-
492
- if (
493
- isinstance(item, (Table, Figure))
494
- and item.text
495
- and item.obj_type in main_text_labels
496
- ):
497
- embedded_captions.add(item.text)
498
-
499
- # serialize document to markdown
500
- for orig_item in self.main_text[main_text_start:main_text_stop]:
501
- markdown_text = ""
502
-
503
- item = (
504
- self._resolve_ref(orig_item)
505
- if isinstance(orig_item, Ref)
506
- else orig_item
507
- )
508
- if item is None:
509
- continue
510
-
511
- item_type = item.obj_type
512
- if isinstance(item, BaseText) and item_type in main_text_labels:
1152
+ # collect all captions embedded in table and figure objects
1153
+ # to avoid repeating them
1154
+ embedded_captions = set()
1155
+ skip_count = 0
1156
+ for ix, (item, level) in enumerate(self.iterate_items(self.body)):
1157
+ if skip_count < from_element:
1158
+ skip_count += 1
1159
+ continue # skip as many items as you want
1160
+
1161
+ if to_element and ix >= to_element:
1162
+ break
1163
+
1164
+ if (
1165
+ isinstance(item, (TableItem, PictureItem))
1166
+ and len(item.captions) > 0
1167
+ and item.label in labels
1168
+ ):
1169
+ caption = item.caption_text(self)
1170
+ if caption:
1171
+ embedded_captions.add(caption)
1172
+
1173
+ skip_count = 0
1174
+ for ix, (item, level) in enumerate(self.iterate_items(self.body)):
1175
+ if skip_count < from_element:
1176
+ skip_count += 1
1177
+ continue # skip as many items as you want
1178
+
1179
+ if to_element and ix >= to_element:
1180
+ break
1181
+
1182
+ markdown_text = ""
1183
+
1184
+ if isinstance(item, DocItem):
1185
+ item_type = item.label
1186
+
1187
+ if isinstance(item, TextItem) and item_type in labels:
513
1188
  text = item.text
514
1189
 
515
1190
  # skip captions of they are embedded in the actual
516
1191
  # floating object
517
- if item_type == "caption" and text in embedded_captions:
1192
+ if item_type == DocItemLabel.CAPTION and text in embedded_captions:
518
1193
  continue
519
1194
 
520
1195
  # ignore repeated text
@@ -524,7 +1199,7 @@ class ExportedCCSDocument(
524
1199
  prev_text = text
525
1200
 
526
1201
  # first title match
527
- if item_type == "title" and not has_title:
1202
+ if item_type == DocItemLabel.TITLE and not has_title:
528
1203
  if strict_text:
529
1204
  markdown_text = f"{text}"
530
1205
  else:
@@ -532,78 +1207,89 @@ class ExportedCCSDocument(
532
1207
  has_title = True
533
1208
 
534
1209
  # secondary titles
535
- elif item_type in {"title", "subtitle-level-1"} or (
536
- has_title and item_type == "title"
537
- ):
1210
+ elif item_type in {
1211
+ DocItemLabel.TITLE,
1212
+ DocItemLabel.SECTION_HEADER,
1213
+ } or (has_title and item_type == DocItemLabel.TITLE):
538
1214
  if strict_text:
539
1215
  markdown_text = f"{text}"
540
1216
  else:
541
1217
  markdown_text = f"## {text}"
542
1218
 
1219
+ # secondary titles
1220
+ elif isinstance(item, ListItem):
1221
+ if item.enumerated:
1222
+ marker = item.marker
1223
+ else:
1224
+ marker = "-"
1225
+
1226
+ markdown_text = f"{marker} {text}"
1227
+
543
1228
  # normal text
544
1229
  else:
545
1230
  markdown_text = text
546
1231
 
547
- elif (
548
- isinstance(item, Table)
549
- and item.data
550
- and item_type in main_text_labels
551
- ):
1232
+ elif isinstance(item, TableItem) and item.data and item_type in labels:
1233
+ parts = []
552
1234
 
553
- md_table = ""
554
- table = []
555
- for row in item.data:
556
- tmp = []
557
- for col in row:
558
- tmp.append(col.text)
559
- table.append(tmp)
560
-
561
- if len(table) > 1 and len(table[0]) > 0:
562
- try:
563
- md_table = tabulate(
564
- table[1:], headers=table[0], tablefmt="github"
565
- )
566
- except ValueError:
567
- md_table = tabulate(
568
- table[1:],
569
- headers=table[0],
570
- tablefmt="github",
571
- disable_numparse=True,
572
- )
573
-
574
- markdown_text = ""
575
- if item.text:
576
- markdown_text = item.text
1235
+ # Compute the caption
1236
+ if caption := item.caption_text(self):
1237
+ parts.append(caption)
1238
+ parts.append("\n")
1239
+
1240
+ # Rendered the item
577
1241
  if not strict_text:
578
- markdown_text += "\n" + md_table
1242
+ md_table = item.export_to_markdown()
1243
+ if md_table:
1244
+ parts.append(item.export_to_markdown())
1245
+
1246
+ # Combine parts
1247
+ markdown_text = "\n".join(parts)
579
1248
 
580
- elif isinstance(item, Figure) and item_type in main_text_labels:
1249
+ elif isinstance(item, PictureItem) and item_type in labels:
1250
+ parts = []
581
1251
 
582
- markdown_text = ""
583
- if item.text:
584
- markdown_text = item.text
1252
+ # Compute the caption
1253
+ if caption := item.caption_text(self):
1254
+ parts.append(caption)
1255
+ parts.append("\n")
1256
+
1257
+ # Rendered the item
585
1258
  if not strict_text:
586
- markdown_text += f"\n{image_placeholder}"
1259
+ parts.append(f"{image_placeholder}")
1260
+
1261
+ # Combine parts
1262
+ markdown_text = "\n".join(parts)
587
1263
 
588
- if markdown_text:
589
- md_texts.append(markdown_text)
1264
+ if markdown_text:
1265
+ md_texts.append(markdown_text)
590
1266
 
591
1267
  result = delim.join(md_texts)
592
1268
  return result
593
1269
 
1270
def export_to_text(
    self,
    delim: str = "\n\n",
    from_element: int = 0,
    to_element: Optional[int] = None,
    labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
) -> str:
    """Export the document as plain text.

    Delegates to ``export_to_markdown`` with ``strict_text=True`` and an
    empty image placeholder.

    :param delim: str: Delimiter placed between exported parts. (Default value = "\n\n")
    :param from_element: int: Forwarded to ``export_to_markdown``. (Default value = 0)
    :param to_element: Optional[int]: Forwarded to ``export_to_markdown``. (Default value = None)
    :param labels: set[DocItemLabel]: Labels to include in the export.
    :returns: The exported text.
    :rtype: str
    """
    plain_text = self.export_to_markdown(
        delim=delim,
        from_element=from_element,
        to_element=to_element,
        labels=labels,
        strict_text=True,
        image_placeholder="",
    )
    return plain_text
1286
+
594
1287
  def export_to_document_tokens(
595
1288
  self,
596
1289
  delim: str = "\n\n",
597
- main_text_start: int = 0,
598
- main_text_stop: Optional[int] = None,
599
- main_text_labels: list[str] = [
600
- "title",
601
- "subtitle-level-1",
602
- "paragraph",
603
- "caption",
604
- "table",
605
- "figure",
606
- ],
1290
+ from_element: int = 0,
1291
+ to_element: Optional[int] = None,
1292
+ labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
607
1293
  xsize: int = 100,
608
1294
  ysize: int = 100,
609
1295
  add_location: bool = True,
@@ -616,11 +1302,23 @@ class ExportedCCSDocument(
616
1302
  ) -> str:
617
1303
  r"""Exports the document content to an DocumentToken format.
618
1304
 
619
- Operates on a slice of the document's main_text as defined through arguments
620
- main_text_start and main_text_stop; defaulting to the whole main_text.
621
-
622
- Returns:
623
- str: The content of the document formatted as a DocTags string.
1305
+ Operates on a slice of the document's body as defined through arguments
1306
+ from_element and to_element; defaulting to the whole main_text.
1307
+
1308
+ :param delim: str: (Default value = "\n\n")
1309
+ :param from_element: int: (Default value = 0)
1310
+ :param to_element: Optional[int]: (Default value = None)
1311
+ :param labels: set[DocItemLabel]
1312
+ :param xsize: int: (Default value = 100)
1313
+ :param ysize: int: (Default value = 100)
1314
+ :param add_location: bool: (Default value = True)
1315
+ :param add_content: bool: (Default value = True)
1316
+ :param add_page_index: bool: (Default value = True)
1317
+ :param # table specific flagsadd_table_cell_location: bool
1318
+ :param add_table_cell_label: bool: (Default value = True)
1319
+ :param add_table_cell_text: bool: (Default value = True)
1320
+ :returns: The content of the document formatted as a DocTags string.
1321
+ :rtype: str
624
1322
  """
625
1323
  new_line = ""
626
1324
  if delim:
@@ -630,82 +1328,113 @@ class ExportedCCSDocument(
630
1328
 
631
1329
  # pagedims = self.get_map_to_page_dimensions()
632
1330
 
633
- if self.main_text is not None:
634
- for orig_item in self.main_text[main_text_start:main_text_stop]:
1331
+ skip_count = 0
1332
+ for ix, (item, level) in enumerate(self.iterate_items(self.body)):
1333
+ if skip_count < from_element:
1334
+ skip_count += 1
1335
+ continue # skip as many items as you want
635
1336
 
636
- item = (
637
- self._resolve_ref(orig_item)
638
- if isinstance(orig_item, Ref)
639
- else orig_item
640
- )
1337
+ if to_element and ix >= to_element:
1338
+ break
641
1339
 
642
- if item is None:
643
- continue
1340
+ if not isinstance(item, DocItem):
1341
+ continue
644
1342
 
645
- prov = item.prov
1343
+ prov = item.prov
646
1344
 
647
- page_i = -1
648
- page_w = 0.0
649
- page_h = 0.0
1345
+ page_i = -1
650
1346
 
651
- if (
652
- add_location
653
- and self.page_dimensions is not None
654
- and prov is not None
655
- and len(prov) > 0
656
- ):
1347
+ if add_location and len(self.pages) and len(prov) > 0:
657
1348
 
658
- page_i = prov[0].page
659
- page_dim = self.page_dimensions[page_i - 1]
1349
+ page_i = prov[0].page_no
1350
+ page_dim = self.pages[page_i].size
660
1351
 
661
- page_w = float(page_dim.width)
662
- page_h = float(page_dim.height)
1352
+ float(page_dim.width)
1353
+ float(page_dim.height)
663
1354
 
664
- item_type = item.obj_type
665
- if isinstance(item, BaseText) and (item_type in main_text_labels):
1355
+ item_type = item.label
1356
+ if isinstance(item, TextItem) and (item_type in labels):
666
1357
 
667
- doctags += item.export_to_document_tokens(
668
- new_line=new_line,
669
- page_w=page_w,
670
- page_h=page_h,
671
- xsize=xsize,
672
- ysize=ysize,
673
- add_location=add_location,
674
- add_content=add_content,
675
- add_page_index=add_page_index,
676
- )
1358
+ doctags += item.export_to_document_tokens(
1359
+ doc=self,
1360
+ new_line=new_line,
1361
+ xsize=xsize,
1362
+ ysize=ysize,
1363
+ add_location=add_location,
1364
+ add_content=add_content,
1365
+ add_page_index=add_page_index,
1366
+ )
677
1367
 
678
- elif isinstance(item, Table) and (item_type in main_text_labels):
679
-
680
- doctags += item.export_to_document_tokens(
681
- new_line=new_line,
682
- page_w=page_w,
683
- page_h=page_h,
684
- xsize=xsize,
685
- ysize=ysize,
686
- add_caption=True,
687
- add_location=add_location,
688
- add_content=add_content,
689
- add_cell_location=add_table_cell_location,
690
- add_cell_label=add_table_cell_label,
691
- add_cell_text=add_table_cell_text,
692
- add_page_index=add_page_index,
693
- )
1368
+ elif isinstance(item, TableItem) and (item_type in labels):
1369
+
1370
+ doctags += item.export_to_document_tokens(
1371
+ doc=self,
1372
+ new_line=new_line,
1373
+ xsize=xsize,
1374
+ ysize=ysize,
1375
+ add_caption=True,
1376
+ add_location=add_location,
1377
+ add_content=add_content,
1378
+ add_cell_location=add_table_cell_location,
1379
+ add_cell_label=add_table_cell_label,
1380
+ add_cell_text=add_table_cell_text,
1381
+ add_page_index=add_page_index,
1382
+ )
694
1383
 
695
- elif isinstance(item, Figure) and (item_type in main_text_labels):
696
-
697
- doctags += item.export_to_document_tokens(
698
- new_line=new_line,
699
- page_w=page_w,
700
- page_h=page_h,
701
- xsize=xsize,
702
- ysize=ysize,
703
- add_caption=True,
704
- add_location=add_location,
705
- add_content=add_content,
706
- add_page_index=add_page_index,
707
- )
1384
+ elif isinstance(item, PictureItem) and (item_type in labels):
1385
+
1386
+ doctags += item.export_to_document_tokens(
1387
+ doc=self,
1388
+ new_line=new_line,
1389
+ xsize=xsize,
1390
+ ysize=ysize,
1391
+ add_caption=True,
1392
+ add_location=add_location,
1393
+ add_content=add_content,
1394
+ add_page_index=add_page_index,
1395
+ )
708
1396
 
709
1397
  doctags += DocumentToken.END_DOCUMENT.value
710
1398
 
711
1399
  return doctags
1400
+
1401
def add_page(
    self, page_no: int, size: Size, image: Optional[ImageRef] = None
) -> PageItem:
    """Create a PageItem and store it in ``self.pages`` under ``page_no``.

    An existing entry with the same page number is replaced.

    :param page_no: int: Page number, also used as the key in ``self.pages``.
    :param size: Size: Page size.
    :param image: Optional[ImageRef]: Image reference for the page. (Default value = None)
    :returns: The newly created page item.
    """
    page = PageItem(page_no=page_no, size=size, image=image)
    self.pages[page_no] = page
    return page
1414
+
1415
@field_validator("version")
@classmethod
def check_version_is_compatible(cls, v: str) -> str:
    """Check if this document version is compatible with current version.

    A version is accepted when it matches VERSION_PATTERN, has the same major
    component as CURRENT_VERSION and a minor component that is not greater.

    :param v: str: Version string found on the document.
    :returns: CURRENT_VERSION (an accepted document is stamped with the
        current schema version).
    :rtype: str
    :raises ValueError: If the version cannot be parsed or is incompatible.
    """
    current_match = re.match(VERSION_PATTERN, CURRENT_VERSION)
    doc_match = re.match(VERSION_PATTERN, v)
    error = ValueError(
        f"incompatible version {v} with schema version {CURRENT_VERSION}"
    )

    if doc_match is None or current_match is None:
        raise error
    if doc_match["major"] != current_match["major"]:
        raise error

    # Compare minor components as integers: as strings, "10" > "9" is False
    # and a newer minor would slip through.
    # NOTE(review): assumes the "minor" group of VERSION_PATTERN is purely
    # numeric; a non-numeric value makes int() raise ValueError (rejected).
    if int(doc_match["minor"]) > int(current_match["minor"]):
        raise error

    return CURRENT_VERSION
1432
+
1433
@model_validator(mode="after")  # type: ignore
@classmethod
def validate_document(cls, d: "DoclingDocument"):
    """Reject documents whose body or furniture tree is inconsistent.

    :param d: "DoclingDocument": The document being validated.
    :returns: The document itself when both trees pass ``validate_tree``.
    :raises ValueError: If either tree fails the check.
    """
    trees_ok = d.validate_tree(d.body) and d.validate_tree(d.furniture)
    if not trees_ok:
        raise ValueError("Document hierachy is inconsistent.")

    return d