docling-core 1.7.2__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core has been flagged as potentially problematic; review the release details below before upgrading.
- docling_core/transforms/chunker/__init__.py +2 -8
- docling_core/transforms/chunker/base.py +27 -40
- docling_core/transforms/chunker/hierarchical_chunker.py +144 -312
- docling_core/types/__init__.py +3 -18
- docling_core/types/doc/__init__.py +25 -0
- docling_core/types/doc/base.py +136 -451
- docling_core/types/doc/document.py +1289 -559
- docling_core/types/{experimental → doc}/labels.py +4 -1
- docling_core/types/legacy_doc/__init__.py +6 -0
- docling_core/types/legacy_doc/base.py +485 -0
- docling_core/types/{doc → legacy_doc}/doc_ann.py +1 -1
- docling_core/types/{doc → legacy_doc}/doc_ocr.py +1 -1
- docling_core/types/{doc → legacy_doc}/doc_raw.py +1 -1
- docling_core/types/legacy_doc/document.py +715 -0
- docling_core/types/rec/subject.py +1 -1
- docling_core/utils/generate_docs.py +82 -0
- docling_core/utils/{ds_generate_jsonschema.py → generate_jsonschema.py} +4 -4
- docling_core/utils/validators.py +3 -3
- {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/METADATA +17 -17
- {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/RECORD +24 -31
- docling_core-2.0.1.dist-info/entry_points.txt +5 -0
- docling_core/transforms/id_generator/__init__.py +0 -12
- docling_core/transforms/id_generator/base.py +0 -30
- docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -27
- docling_core/transforms/id_generator/uuid_generator.py +0 -34
- docling_core/transforms/metadata_extractor/__init__.py +0 -13
- docling_core/transforms/metadata_extractor/base.py +0 -59
- docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +0 -59
- docling_core/types/experimental/__init__.py +0 -30
- docling_core/types/experimental/base.py +0 -167
- docling_core/types/experimental/document.py +0 -1192
- docling_core/utils/ds_generate_docs.py +0 -144
- docling_core-1.7.2.dist-info/entry_points.txt +0 -5
- /docling_core/types/{doc → legacy_doc}/tokens.py +0 -0
- {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/LICENSE +0 -0
- {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/WHEEL +0 -0
docling_core/types/doc/document.py

@@ -1,452 +1,1117 @@
-#
-# Copyright IBM Corp. 2024 - 2024
-# SPDX-License-Identifier: MIT
-#
-
 """Models for the Docling Document data type."""

-
-
+import base64
+import mimetypes
+import re
+import typing
+from io import BytesIO
+from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union

+import pandas as pd
+from PIL import Image as PILImage
 from pydantic import (
-
+    AnyUrl,
     BaseModel,
+    ConfigDict,
     Field,
-
-
+    StringConstraints,
+    computed_field,
+    field_validator,
     model_validator,
 )
 from tabulate import tabulate
+from typing_extensions import Annotated, Self

-from docling_core.search.
-from docling_core.types.base import
-
-
-
-    DescriptionAdvancedT,
-    DescriptionAnalyticsT,
-    FileInfoObject,
-    Identifier,
-    IdentifierTypeT,
-    LanguageT,
-    Log,
-)
-from docling_core.types.doc.base import (
-    BaseCell,
-    BaseText,
-    BitmapObject,
-    Figure,
-    PageDimensions,
-    PageReference,
-    Ref,
-    S3Data,
-    Table,
-)
-from docling_core.types.doc.tokens import DocumentToken
-from docling_core.utils.alias import AliasModel
+from docling_core.search.package import VERSION_PATTERN
+from docling_core.types.base import _JSON_POINTER_REGEX
+from docling_core.types.doc import BoundingBox, Size
+from docling_core.types.doc.labels import DocItemLabel, GroupLabel
+from docling_core.types.legacy_doc.tokens import DocumentToken

+Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
+LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
+CURRENT_VERSION: Final = "1.0.0"

-
-
+DEFAULT_EXPORT_LABELS = {
+    DocItemLabel.TITLE,
+    DocItemLabel.DOCUMENT_INDEX,
+    DocItemLabel.SECTION_HEADER,
+    DocItemLabel.PARAGRAPH,
+    DocItemLabel.CAPTION,
+    DocItemLabel.TABLE,
+    DocItemLabel.PICTURE,
+    DocItemLabel.FORMULA,
+    DocItemLabel.CHECKBOX_UNSELECTED,
+    DocItemLabel.CHECKBOX_SELECTED,
+    DocItemLabel.TEXT,
+    DocItemLabel.LIST_ITEM,
+    DocItemLabel.CODE,
+}

-    author: Optional[list[StrictStr]] = None
-    keywords: Optional[str] = None
-    subject: Optional[str] = None
-    title: Optional[StrictStr] = None
-    creation_date: Optional[str] = None  # datetime

+class BasePictureData(BaseModel):
+    """BasePictureData."""

-
-    """File info object."""
+    kind: str

-    num_pages: Optional[int] = Field(default=None, alias="#-pages")

-
-
-        alias="collection-name",
-        json_schema_extra=es_field(type="keyword", ignore_above=8191),
-    )
-    description: Optional[CCSFileInfoDescription] = Field(
-        default=None, json_schema_extra=es_field(suppress=True)
-    )
-    page_hashes: Optional[list[PageReference]] = Field(
-        default=None, alias="page-hashes"
-    )
+class PictureClassificationClass(BaseModel):
+    """PictureClassificationData."""

+    class_name: str
+    confidence: float

-class Affiliation(BaseModel, extra="forbid"):
-    """Affiliation."""
-
-    name: str = Field(
-        ...,
-        json_schema_extra=es_field(
-            fields={
-                "lower": {
-                    "normalizer": "lowercase_asciifolding",
-                    "type": "keyword",
-                    "ignore_above": 8191,
-                },
-                "keyword": {"type": "keyword", "ignore_above": 8191},
-            },
-        ),
-    )
-    id: Optional[str] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    source: Optional[str] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )

+class PictureClassificationData(BasePictureData):
+    """PictureClassificationData."""

-
-
-
-    name: str = Field(
-        ...,
-        json_schema_extra=es_field(
-            type="text",
-            fields={
-                "lower": {
-                    "normalizer": "lowercase_asciifolding",
-                    "type": "keyword",
-                    "ignore_above": 8191,
-                },
-                "keyword": {"type": "keyword", "ignore_above": 8191},
-            },
-        ),
-    )
-    id: Optional[str] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    source: Optional[str] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    affiliations: Optional[list[Affiliation]] = None
+    kind: Literal["classification"] = "classification"
+    provenance: str
+    predicted_classes: List[PictureClassificationClass]


-class
-    """
+class PictureDescriptionData(BasePictureData):
+    """PictureDescriptionData."""

-
-
-
-    )
-    name: StrictStr = Field(
-        json_schema_extra=es_field(type="keyword", ignore_above=8191),
-        description="Name of the publication.",
-    )
-    alternate_names: Optional[list[StrictStr]] = Field(
-        default=None,
-        json_schema_extra=es_field(type="text"),
-        title="Alternate Names",
-        description="Other names or abbreviations of this publication.",
-    )
-    type: Optional[list[StrictStr]] = Field(
-        default=None,
-        json_schema_extra=es_field(type="keyword", ignore_above=8191),
-        description="Type of publication (journal article, conference, review,...).",
-    )
-    pages: Optional[StrictStr] = Field(
-        default=None,
-        json_schema_extra=es_field(type="text"),
-        description="Page range in the publication.",
-    )
-    issue: Optional[StrictStr] = Field(
-        default=None,
-        json_schema_extra=es_field(type="keyword", ignore_above=8191),
-        description="Publication issue (issue number).",
-    )
-    volume: Optional[StrictStr] = Field(
-        default=None,
-        json_schema_extra=es_field(type="keyword", ignore_above=8191),
-        description="Publication volume.",
-    )
-    url: Optional[AnyHttpUrl] = Field(
-        default=None,
-        json_schema_extra=es_field(type="keyword", ignore_above=8191),
-        description="URL on the publication site.",
-    )
+    kind: Literal["description"] = "description"
+    text: str
+    provenance: str


-class
-    """
+class PictureMoleculeData(BaseModel):
+    """PictureMoleculeData."""

-
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    text: Optional[StrictStr] = None
+    kind: Literal["molecule_data"] = "molecule_data"

+    smi: str
+    confidence: float
+    class_name: str
+    segmentation: List[Tuple[float, float]]
+    provenance: str

-class CCSDocumentDescription(
-    AliasModel,
-    Generic[
-        DescriptionAdvancedT,
-        DescriptionAnalyticsT,
-        IdentifierTypeT,
-        LanguageT,
-        CollectionNameTypeT,
-    ],
-):
-    """Description in document."""
-
-    title: Optional[StrictStr] = None
-    abstract: Optional[list[StrictStr]] = None
-    authors: Optional[list[Author]] = None
-    affiliations: Optional[list[Affiliation]] = None
-    subjects: Optional[list[str]] = Field(
-        default=None,
-        json_schema_extra=es_field(
-            fields={"keyword": {"ignore_above": 8191, "type": "keyword"}}
-        ),
-    )
-    keywords: Optional[list[str]] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    publication_date: Optional[datetime] = None
-    languages: Optional[list[LanguageT]] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    license_: Optional[DescriptionLicense] = Field(default=None, alias="license")
-    publishers: Optional[list[StrictStr]] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    url_refs: Optional[list[str]] = Field(
-        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
-    )
-    references: Optional[list[Identifier[IdentifierTypeT]]] = None
-    publication: Optional[list[Publication]] = Field(
-        default=None, description="List of publication journals or venues."
-    )
-    reference_count: Optional[NonNegativeInt] = Field(
-        default=None,
-        title="Reference Count",
-        description="Total number of documents referenced by this document.",
-        json_schema_extra=es_field(type="integer"),
-    )
-    citation_count: Optional[NonNegativeInt] = Field(
-        default=None,
-        title="Citation Count",
-        description=(
-            "Total number of citations that this document has received (number "
-            "of documents in whose bibliography this document appears)."
-        ),
-        json_schema_extra=es_field(type="integer"),
-    )
-    citation_date: Optional[datetime] = Field(
-        default=None,
-        title="Citation Count Date",
-        description="Last update date of the citation count.",
-    )
-    advanced: Optional[DescriptionAdvancedT] = None
-    analytics: Optional[DescriptionAnalyticsT] = None
-    logs: list[Log]
-    collection: Optional[CollectionDocumentInfo[CollectionNameTypeT]] = Field(
-        default=None, description="The collection information of this document."
-    )
-    acquisition: Optional[Acquisition] = Field(
-        default=None,
-        description=(
-            "Information on how the document was obtained, for data governance"
-            " purposes."
-        ),
-    )

+class PictureMiscData(BaseModel):
+    """PictureMiscData."""

-
-
-
-
-
-
-
-
-
-
-    """Minimal model for a document."""
-
-    name: StrictStr = Field(alias="_name")
-    obj_type: Optional[StrictStr] = Field("document", alias="type")
-    description: CCSDocumentDescription[
-        DescriptionAdvancedT,
-        DescriptionAnalyticsT,
-        IdentifierTypeT,
-        LanguageT,
-        CollectionNameTypeT,
-    ]
-    file_info: FileInfoObject = Field(alias="file-info")
-    main_text: Optional[list[Union[Ref, BaseText]]] = Field(
-        default=None, alias="main-text"
-    )
-    figures: Optional[list[Figure]] = None
-    tables: Optional[list[Table]] = None
-
-
-class CCSDocument(
-    MinimalDocument,
-    Generic[
-        DescriptionAdvancedT,
-        DescriptionAnalyticsT,
-        IdentifierTypeT,
-        LanguageT,
-        CollectionNameTypeT,
+    kind: Literal["misc"] = "misc"
+    content: Dict[str, Any]
+
+
+PictureDataType = Annotated[
+    Union[
+        PictureClassificationData,
+        PictureDescriptionData,
+        PictureMoleculeData,
+        PictureMiscData,
     ],
-)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    Field(discriminator="kind"),
+]
+
+
+class TableCell(BaseModel):
+    """TableCell."""
+
+    bbox: Optional[BoundingBox] = None
+    row_span: int = 1
+    col_span: int = 1
+    start_row_offset_idx: int
+    end_row_offset_idx: int
+    start_col_offset_idx: int
+    end_col_offset_idx: int
+    text: str
+    column_header: bool = False
+    row_header: bool = False
+    row_section: bool = False

     @model_validator(mode="before")
     @classmethod
-    def
-        """
-        if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def from_dict_format(cls, data: Any) -> Any:
+        """from_dict_format."""
+        if isinstance(data, Dict):
+            # Check if this is a native BoundingBox or a bbox from docling-ibm-models
+            if (
+                # "bbox" not in data
+                # or data["bbox"] is None
+                # or isinstance(data["bbox"], BoundingBox)
+                "text"
+                in data
+            ):
+                return data
+            text = data["bbox"].get("token", "")
+            if not len(text):
+                text_cells = data.pop("text_cell_bboxes", None)
+                if text_cells:
+                    for el in text_cells:
+                        text += el["token"] + " "
+
+                text = text.strip()
+            data["text"] = text

-
-            descr = data["description"].get(key)
-            if descr is not None and not isinstance(descr, list):
-                if isinstance(descr, dict):
-                    data["description"][key] = [descr]
-                else:
-                    data["description"].pop(key)
+        return data

-        if data.get("main-text"):
-            for item in data["main-text"]:
-                if ref := item.pop("__ref", None):
-                    item["$ref"] = ref

-
+class TableData(BaseModel):  # TBD
+    """BaseTableData."""

+    table_cells: List[TableCell] = []
+    num_rows: int = 0
+    num_cols: int = 0

-
-
-
-
-
-
-
-
-
-
-
+    @computed_field  # type: ignore
+    @property
+    def grid(
+        self,
+    ) -> List[List[TableCell]]:
+        """grid."""
+        # Initialise empty table data grid (only empty cells)
+        table_data = [
+            [
+                TableCell(
+                    text="",
+                    start_row_offset_idx=i,
+                    end_row_offset_idx=i + 1,
+                    start_col_offset_idx=j,
+                    end_col_offset_idx=j + 1,
+                )
+                for j in range(self.num_cols)
+            ]
+            for i in range(self.num_rows)
+        ]
+
+        # Overwrite cells in table data for which there is actual cell content.
+        for cell in self.table_cells:
+            for i in range(
+                min(cell.start_row_offset_idx, self.num_rows),
+                min(cell.end_row_offset_idx, self.num_rows),
+            ):
+                for j in range(
+                    min(cell.start_col_offset_idx, self.num_cols),
+                    min(cell.end_col_offset_idx, self.num_cols),
+                ):
+                    table_data[i][j] = cell
+
+        return table_data

-
-
-
-
+
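A minimal usage sketch of the TableCell/TableData models added above; the import path is assumed from this file's location and is not part of the diff itself:

```python
from docling_core.types.doc.document import TableCell, TableData

# Two cells of a 2x2 table; positions not covered by a cell stay empty in the grid.
cells = [
    TableCell(text="Name", start_row_offset_idx=0, end_row_offset_idx=1,
              start_col_offset_idx=0, end_col_offset_idx=1, column_header=True),
    TableCell(text="Alice", start_row_offset_idx=1, end_row_offset_idx=2,
              start_col_offset_idx=0, end_col_offset_idx=1),
]
data = TableData(table_cells=cells, num_rows=2, num_cols=2)

# The computed `grid` property materialises a num_rows x num_cols matrix of TableCell.
for row in data.grid:
    print([cell.text for cell in row])
```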
+class DocumentOrigin(BaseModel):
+    """FileSource."""
+
+    mimetype: str  # the mimetype of the original file
+    binary_hash: Uint64  # the binary hash of the original file.
+    # TODO: Change to be Uint64 and provide utility method to generate
+
+    filename: str  # The name of the original file, including extension, without path.
+    # Could stem from filesystem, source URI, Content-Disposition header, ...
+
+    uri: Optional[AnyUrl] = (
+        None  # any possible reference to a source file,
+        # from any file handler protocol (e.g. https://, file://, s3://)
     )
-
-
-
-
-
-
-
-
-        CollectionNameTypeT,
+
+    _extra_mimetypes: typing.ClassVar[List[str]] = [
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
+        "application/vnd.openxmlformats-officedocument.presentationml.template",
+        "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
+        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+        "text/asciidoc",
     ]
-
-
-
-)
-
-
+
+    @field_validator("binary_hash", mode="before")
+    @classmethod
+    def parse_hex_string(cls, value):
+        """parse_hex_string."""
+        if isinstance(value, str):
+            try:
+                # Convert hex string to an integer
+                hash_int = Uint64(value, 16)
+                # Mask to fit within 64 bits (unsigned)
+                return (
+                    hash_int & 0xFFFFFFFFFFFFFFFF
+                )  # TODO be sure it doesn't clip uint64 max
+            except ValueError:
+                raise ValueError(f"Invalid sha256 hexdigest: {value}")
+        return value  # If already an int, return it as is.
+
+    @field_validator("mimetype")
+    @classmethod
+    def validate_mimetype(cls, v):
+        """validate_mimetype."""
+        # Check if the provided MIME type is valid using mimetypes module
+        if v not in mimetypes.types_map.values() and v not in cls._extra_mimetypes:
+            raise ValueError(f"'{v}' is not a valid MIME type")
+        return v
+
+
+class RefItem(BaseModel):
+    """RefItem."""
+
+    cref: str = Field(alias="$ref", pattern=_JSON_POINTER_REGEX)
+
+    # This method makes RefItem compatible with DocItem
+    def get_ref(self):
+        """get_ref."""
+        return self
+
+    model_config = ConfigDict(
+        populate_by_name=True,
     )
-    page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers")
-    page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers")
-    s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data")
-    identifiers: Optional[list[Identifier[IdentifierTypeT]]] = None

-
+    def resolve(self, doc: "DoclingDocument"):
+        """resolve."""
+        path_components = self.cref.split("/")
+        if (num_comps := len(path_components)) == 3:
+            _, path, index_str = path_components
+            index = int(index_str)
+            obj = doc.__getattribute__(path)[index]
+        elif num_comps == 2:
+            _, path = path_components
+            obj = doc.__getattribute__(path)
+        else:
+            raise RuntimeError(f"Unsupported number of path components: {num_comps}")
+        return obj
+
+
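A short sketch of the DocumentOrigin validators shown above; the hex digest is a made-up placeholder and the import path is assumed:

```python
from docling_core.types.doc.document import DocumentOrigin

origin = DocumentOrigin(
    mimetype="application/pdf",
    filename="report.pdf",
    # A hex string is parsed and masked down to an unsigned 64-bit integer.
    binary_hash="1f3870be274f6c49b3e31a0c6728957f",
)
print(origin.binary_hash)  # stored as a Uint64 value
```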
+class ImageRef(BaseModel):
+    """ImageRef."""
+
+    mimetype: str
+    dpi: int
+    size: Size
+    uri: AnyUrl
+    _pil: Optional[PILImage.Image] = None
+
+    @property
+    def pil_image(self) -> PILImage.Image:
+        """Return the PIL Image."""
+        if self._pil is not None:
+            return self._pil
+
+        if str(self.uri).startswith("data:"):
+            encoded_img = str(self.uri).split(",")[1]
+            decoded_img = base64.b64decode(encoded_img)
+            self._pil = PILImage.open(BytesIO(decoded_img))
+        else:
+            self._pil = PILImage.open(str(self.uri))
+
+        return self._pil
+
+    @field_validator("mimetype")
     @classmethod
-    def
-        """
-        if
-
-
-
-                if ref := item.pop("__ref", None):
-                    item["$ref"] = ref
+    def validate_mimetype(cls, v):
+        """validate_mimetype."""
+        # Check if the provided MIME type is valid using mimetypes module
+        if v not in mimetypes.types_map.values():
+            raise ValueError(f"'{v}' is not a valid MIME type")
+        return v

-
+    @classmethod
+    def from_pil(cls, image: PILImage.Image, dpi: int) -> Self:
+        """Construct ImageRef from a PIL Image."""
+        buffered = BytesIO()
+        image.save(buffered, format="PNG")
+        img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+        img_uri = f"data:image/png;base64,{img_str}"
+        return cls(
+            mimetype="image/png",
+            dpi=dpi,
+            size=Size(width=image.width, height=image.height),
+            uri=img_uri,
+            _pil=image,
+        )

-    def _resolve_ref(self, item: Ref) -> Optional[Union[BaseCell, BaseText]]:
-        """Return the resolved reference.

-
-
+class ProvenanceItem(BaseModel):
+    """ProvenanceItem."""
+
+    page_no: int
+    bbox: BoundingBox
+    charspan: Tuple[int, int]
+
+
+class NodeItem(BaseModel):
+    """NodeItem."""
+
+    self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
+    parent: Optional[RefItem] = None
+    children: List[RefItem] = []
+
+    model_config = ConfigDict(extra="forbid")
+
+    def get_ref(self):
+        """get_ref."""
+        return RefItem(cref=self.self_ref)
+
+
+class GroupItem(NodeItem):  # Container type, can't be a leaf node
+    """GroupItem."""
+
+    name: str = (
+        "group"  # Name of the group, e.g. "Introduction Chapter",
+        # "Slide 5", "Navigation menu list", ...
+    )
+    label: GroupLabel = GroupLabel.UNSPECIFIED
+
+
+class DocItem(
+    NodeItem
+):  # Base type for any element that carries content, can be a leaf node
+    """DocItem."""
+
+    label: DocItemLabel
+    prov: List[ProvenanceItem] = []
+
+    def get_location_tokens(
+        self,
+        doc: "DoclingDocument",
+        new_line: str,
+        xsize: int = 100,
+        ysize: int = 100,
+        add_page_index: bool = True,
+    ) -> str:
+        """Get the location string for the BaseCell."""
+        if not len(self.prov):
+            return ""
+
+        location = ""
+        for prov in self.prov:
+            page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
+
+            page_i = -1
+            if add_page_index:
+                page_i = prov.page_no
+
+            loc_str = DocumentToken.get_location(
+                bbox=prov.bbox.to_bottom_left_origin(page_h).as_tuple(),
+                page_w=page_w,
+                page_h=page_h,
+                xsize=xsize,
+                ysize=ysize,
+                page_i=page_i,
+            )
+            location += f"{loc_str}{new_line}"
+
+        return location
+
+
+class TextItem(DocItem):
+    """TextItem."""
+
+    orig: str  # untreated representation
+    text: str  # sanitized representation
+
+    def export_to_document_tokens(
+        self,
+        doc: "DoclingDocument",
+        new_line: str = "\n",
+        xsize: int = 100,
+        ysize: int = 100,
+        add_location: bool = True,
+        add_content: bool = True,
+        add_page_index: bool = True,
+    ):
+        r"""Export text element to document tokens format.
+
+        :param doc: "DoclingDocument":
+        :param new_line: str:  (Default value = "\n")
+        :param xsize: int:  (Default value = 100)
+        :param ysize: int:  (Default value = 100)
+        :param add_location: bool:  (Default value = True)
+        :param add_content: bool:  (Default value = True)
+        :param add_page_index: bool:  (Default value = True)
+
         """
-
-
-        # NOTE: currently only resolves refs explicitely, such that we can make
-        # assumptions on ref parts
-        if item.obj_type == "table" and self.tables:
-            parts = item.ref.split("/")
-            result = self.tables[int(parts[2])]
-        elif item.obj_type == "figure" and self.figures:
-            parts = item.ref.split("/")
-            result = self.figures[int(parts[2])]
-        elif item.obj_type == "equation" and self.equations:
-            parts = item.ref.split("/")
-            result = self.equations[int(parts[2])]
-        elif item.obj_type == "footnote" and self.footnotes:
-            parts = item.ref.split("/")
-            result = self.footnotes[int(parts[2])]
+        body = f"<{self.label.value}>"

-
+        # TODO: This must be done through an explicit mapping.
+        # assert DocumentToken.is_known_token(
+        #    body
+        # ), f"failed DocumentToken.is_known_token({body})"
+
+        if add_location:
+            body += self.get_location_tokens(
+                doc=doc,
+                new_line="",
+                xsize=xsize,
+                ysize=ysize,
+                add_page_index=add_page_index,
+            )
+
+        if add_content and self.text is not None:
+            body += self.text.strip()
+
+        body += f"</{self.label.value}>{new_line}"
+
+        return body
+
+
+class SectionHeaderItem(TextItem):
+    """SectionItem."""
+
+    label: typing.Literal[DocItemLabel.SECTION_HEADER] = DocItemLabel.SECTION_HEADER
+    level: LevelNumber
+
+
+class ListItem(TextItem):
+    """SectionItem."""
+
+    label: typing.Literal[DocItemLabel.LIST_ITEM] = DocItemLabel.LIST_ITEM
+    enumerated: bool = False
+    marker: str  # The bullet or number symbol that prefixes this list item
+
+
+class FloatingItem(DocItem):
+    """FloatingItem."""
+
+    captions: List[RefItem] = []
+    references: List[RefItem] = []
+    footnotes: List[RefItem] = []
+    image: Optional[ImageRef] = None
+
+    def caption_text(self, doc: "DoclingDocument") -> str:
+        """Computes the caption as a single text."""
+        text = ""
+        for cap in self.captions:
+            text += cap.resolve(doc).text
+        return text
+
+
+class PictureItem(FloatingItem):
+    """PictureItem."""
+
+    label: typing.Literal[DocItemLabel.PICTURE] = DocItemLabel.PICTURE
+
+    annotations: List[PictureDataType] = []
+
+    def export_to_document_tokens(
+        self,
+        doc: "DoclingDocument",
+        new_line: str = "\n",
+        xsize: int = 100,
+        ysize: int = 100,
+        add_location: bool = True,
+        add_caption: bool = True,
+        add_content: bool = True,  # not used at the moment
+        add_page_index: bool = True,
+    ):
+        r"""Export picture to document tokens format.
+
+        :param doc: "DoclingDocument":
+        :param new_line: str:  (Default value = "\n")
+        :param xsize: int:  (Default value = 100)
+        :param ysize: int:  (Default value = 100)
+        :param add_location: bool:  (Default value = True)
+        :param add_caption: bool:  (Default value = True)
+        :param add_content: bool:  (Default value = True)
+        :param # not used at the momentadd_page_index: bool:  (Default value = True)
+
+        """
+        body = f"{DocumentToken.BEG_FIGURE.value}{new_line}"
+
+        if add_location:
+            body += self.get_location_tokens(
+                doc=doc,
+                new_line=new_line,
+                xsize=xsize,
+                ysize=ysize,
+                add_page_index=add_page_index,
+            )
+
+        if add_caption and len(self.captions):
+            text = self.caption_text(doc)
+
+            if len(text):
+                body += f"{DocumentToken.BEG_CAPTION.value}"
+                body += f"{text.strip()}"
+                body += f"{DocumentToken.END_CAPTION.value}"
+                body += f"{new_line}"
+
+        body += f"{DocumentToken.END_FIGURE.value}{new_line}"
+
+        return body

-    def get_map_to_page_dimensions(self):
-        """Get a map from page-index (start at 1) to page-dim [width, height]."""
-        pagedims = {}

-
-
-            pagedims[_.page] = [_.width, _.height]
+class TableItem(FloatingItem):
+    """TableItem."""

-
+    data: TableData
+    label: typing.Literal[DocItemLabel.TABLE] = DocItemLabel.TABLE
+
+    def export_to_dataframe(self) -> pd.DataFrame:
+        """Export the table as a Pandas DataFrame."""
+        if self.data.num_rows == 0 or self.data.num_cols == 0:
+            return pd.DataFrame()
+
+        # Count how many rows are column headers
+        num_headers = 0
+        for i, row in enumerate(self.data.grid):
+            if len(row) == 0:
+                raise RuntimeError(
+                    f"Invalid table. {len(row)=} but {self.data.num_cols=}."
+                )
+
+            any_header = False
+            for cell in row:
+                if cell.column_header:
+                    any_header = True
+                    break
+
+            if any_header:
+                num_headers += 1
+            else:
+                break
+
+        # Create the column names from all col_headers
+        columns: Optional[List[str]] = None
+        if num_headers > 0:
+            columns = ["" for _ in range(self.data.num_cols)]
+            for i in range(num_headers):
+                for j, cell in enumerate(self.data.grid[i]):
+                    col_name = cell.text
+                    if columns[j] != "":
+                        col_name = f".{col_name}"
+                    columns[j] += col_name
+
+        # Create table data
+        table_data = [
+            [cell.text for cell in row] for row in self.data.grid[num_headers:]
+        ]
+
+        # Create DataFrame
+        df = pd.DataFrame(table_data, columns=columns)
+
+        return df
+
+    def export_to_markdown(self) -> str:
+        """Export the table as markdown."""
+        table = []
+        for row in self.data.grid:
+            tmp = []
+            for col in row:
+                tmp.append(col.text)
+            table.append(tmp)
+
+        md_table = ""
+        if len(table) > 1 and len(table[0]) > 0:
+            try:
+                md_table = tabulate(table[1:], headers=table[0], tablefmt="github")
+            except ValueError:
+                md_table = tabulate(
+                    table[1:],
+                    headers=table[0],
+                    tablefmt="github",
+                    disable_numparse=True,
+                )
+        return md_table
+
+    def export_to_html(self) -> str:
+        """Export the table as html."""
+        body = ""
+        nrows = self.data.num_rows
+        ncols = self.data.num_cols
+
+        if not len(self.data.table_cells):
+            return ""
+        for i in range(nrows):
+            body += "<tr>"
+            for j in range(ncols):
+                cell: TableCell = self.data.grid[i][j]
+
+                rowspan, rowstart = (
+                    cell.row_span,
+                    cell.start_row_offset_idx,
+                )
+                colspan, colstart = (
+                    cell.col_span,
+                    cell.start_col_offset_idx,
+                )
+
+                if rowstart != i:
+                    continue
+                if colstart != j:
+                    continue
+
+                content = cell.text.strip()
+                celltag = "td"
+                if cell.column_header:
+                    celltag = "th"
+
+                opening_tag = f"{celltag}"
+                if rowspan > 1:
+                    opening_tag += f' rowspan="{rowspan}"'
+                if colspan > 1:
+                    opening_tag += f' colspan="{colspan}"'
+
+                body += f"<{opening_tag}>{content}</{celltag}>"
+            body += "</tr>"
+        body = f"<table>{body}</table>"
+
+        return body
+
+    def export_to_document_tokens(
+        self,
+        doc: "DoclingDocument",
+        new_line: str = "\n",
+        xsize: int = 100,
+        ysize: int = 100,
+        add_location: bool = True,
+        add_caption: bool = True,
+        add_content: bool = True,
+        add_cell_location: bool = True,
+        add_cell_label: bool = True,
+        add_cell_text: bool = True,
+        add_page_index: bool = True,
+    ):
+        r"""Export table to document tokens format.
+
+        :param doc: "DoclingDocument":
+        :param new_line: str:  (Default value = "\n")
+        :param xsize: int:  (Default value = 100)
+        :param ysize: int:  (Default value = 100)
+        :param add_location: bool:  (Default value = True)
+        :param add_caption: bool:  (Default value = True)
+        :param add_content: bool:  (Default value = True)
+        :param add_cell_location: bool:  (Default value = True)
+        :param add_cell_label: bool:  (Default value = True)
+        :param add_cell_text: bool:  (Default value = True)
+        :param add_page_index: bool:  (Default value = True)
+
+        """
+        body = f"{DocumentToken.BEG_TABLE.value}{new_line}"
+
+        if add_location:
+            body += self.get_location_tokens(
+                doc=doc,
+                new_line=new_line,
+                xsize=xsize,
+                ysize=ysize,
+                add_page_index=add_page_index,
+            )
+
+        if add_caption and len(self.captions):
+            text = self.caption_text(doc)
+
+            if len(text):
+                body += f"{DocumentToken.BEG_CAPTION.value}"
+                body += f"{text.strip()}"
+                body += f"{DocumentToken.END_CAPTION.value}"
+                body += f"{new_line}"
+
+        if add_content and len(self.data.table_cells) > 0:
+            for i, row in enumerate(self.data.grid):
+                body += f"<row_{i}>"
+                for j, col in enumerate(row):
+
+                    text = ""
+                    if add_cell_text:
+                        text = col.text.strip()
+
+                    cell_loc = ""
+                    if (
+                        col.bbox is not None
+                        and add_cell_location
+                        and add_page_index
+                        and len(self.prov) > 0
+                    ):
+                        page_w, page_h = doc.pages[self.prov[0].page_no].size.as_tuple()
+                        cell_loc = DocumentToken.get_location(
+                            bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
+                            page_w=page_w,
+                            page_h=page_h,
+                            xsize=xsize,
+                            ysize=ysize,
+                            page_i=self.prov[0].page_no,
+                        )
+                    elif (
+                        col.bbox is not None
+                        and add_cell_location
+                        and not add_page_index
+                        and len(self.prov) > 0
+                    ):
+                        page_w, page_h = doc.pages[self.prov[0].page_no].size.as_tuple()
+
+                        cell_loc = DocumentToken.get_location(
+                            bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
+                            page_w=page_w,
+                            page_h=page_h,
+                            xsize=xsize,
+                            ysize=ysize,
+                            page_i=-1,
+                        )
+
+                    cell_label = ""
+                    if add_cell_label:
+                        if col.column_header:
+                            cell_label = "<col_header>"
+                        elif col.row_header:
+                            cell_label = "<row_header>"
+                        elif col.row_section:
+                            cell_label = "<row_section>"
+                        else:
+                            cell_label = "<body>"
+
+                    body += f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>"
+
+                body += f"</row_{i}>{new_line}"
+
+            body += f"{DocumentToken.END_TABLE.value}{new_line}"
+
+        return body
+
+
+class KeyValueItem(DocItem):
+    """KeyValueItem."""
+
+
+ContentItem = Union[
+    TextItem, SectionHeaderItem, ListItem, PictureItem, TableItem, KeyValueItem
+]
+
+
+class PageItem(BaseModel):
+    """PageItem."""
+
+    # A page carries separate root items for furniture and body,
+    # only referencing items on the page
+    size: Size
+    image: Optional[ImageRef] = None
+    page_no: int
+
+
+class DoclingDocument(BaseModel):
+    """DoclingDocument."""
+
+    schema_name: typing.Literal["DoclingDocument"] = "DoclingDocument"
+    version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
+        CURRENT_VERSION
+    )
+    name: str  # The working name of this document, without extensions
+    # (could be taken from originating doc, or just "Untitled 1")
+    origin: Optional[DocumentOrigin] = (
+        None  # DoclingDocuments may specify an origin (converted to DoclingDocument).
+        # This is optional, e.g. a DoclingDocument could also be entirely
+        # generated from synthetic data.
+    )
+
+    furniture: GroupItem = GroupItem(
+        name="_root_", self_ref="#/furniture"
+    )  # List[RefItem] = []
+    body: GroupItem = GroupItem(name="_root_", self_ref="#/body")  # List[RefItem] = []
+
+    groups: List[GroupItem] = []
+    texts: List[Union[SectionHeaderItem, ListItem, TextItem]] = []
+    pictures: List[PictureItem] = []
+    tables: List[TableItem] = []
+    key_value_items: List[KeyValueItem] = []
+
+    pages: Dict[int, PageItem] = {}  # empty as default
+
+    def add_group(
+        self,
+        label: Optional[GroupLabel] = None,
+        name: Optional[str] = None,
+        parent: Optional[GroupItem] = None,
+    ) -> GroupItem:
+        """add_group.
+
+        :param label: Optional[GroupLabel]:  (Default value = None)
+        :param name: Optional[str]:  (Default value = None)
+        :param parent: Optional[GroupItem]:  (Default value = None)
+
+        """
+        if not parent:
+            parent = self.body
+
+        group_index = len(self.groups)
+        cref = f"#/groups/{group_index}"
+
+        group = GroupItem(self_ref=cref, parent=parent.get_ref())
+        if name is not None:
+            group.name = name
+        if label is not None:
+            group.label = label
+
+        self.groups.append(group)
+        parent.children.append(RefItem(cref=cref))
+
+        return group
+
+    def add_list_item(
+        self,
+        text: str,
+        enumerated: bool = False,
+        marker: Optional[str] = None,
+        orig: Optional[str] = None,
+        prov: Optional[ProvenanceItem] = None,
+        parent: Optional[GroupItem] = None,
+    ):
+        """add_paragraph.
+
+        :param label: str:
+        :param text: str:
+        :param orig: Optional[str]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param parent: Optional[GroupItem]:  (Default value = None)
+
+        """
+        if not parent:
+            parent = self.body
+
+        if not orig:
+            orig = text
+
+        marker = marker or "-"
+
+        text_index = len(self.texts)
+        cref = f"#/texts/{text_index}"
+        list_item = ListItem(
+            text=text,
+            orig=orig,
+            self_ref=cref,
+            parent=parent.get_ref(),
+            enumerated=enumerated,
+            marker=marker,
+        )
+        if prov:
+            list_item.prov.append(prov)
+
+        self.texts.append(list_item)
+        parent.children.append(RefItem(cref=cref))
+
+        return list_item
+
+    def add_text(
+        self,
+        label: DocItemLabel,
+        text: str,
+        orig: Optional[str] = None,
+        prov: Optional[ProvenanceItem] = None,
+        parent: Optional[GroupItem] = None,
+    ):
+        """add_paragraph.
+
+        :param label: str:
+        :param text: str:
+        :param orig: Optional[str]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param parent: Optional[GroupItem]:  (Default value = None)
+
+        """
+        if not parent:
+            parent = self.body
+
+        if not orig:
+            orig = text
+
+        text_index = len(self.texts)
+        cref = f"#/texts/{text_index}"
+        text_item = TextItem(
+            label=label,
+            text=text,
+            orig=orig,
+            self_ref=cref,
+            parent=parent.get_ref(),
+        )
+        if prov:
+            text_item.prov.append(prov)
+
+        self.texts.append(text_item)
+        parent.children.append(RefItem(cref=cref))
+
+        return text_item
+
+    def add_table(
+        self,
+        data: TableData,
+        caption: Optional[Union[TextItem, RefItem]] = None,  # This is not cool yet.
+        prov: Optional[ProvenanceItem] = None,
+        parent: Optional[GroupItem] = None,
+    ):
+        """add_table.
+
+        :param data: BaseTableData:
+        :param caption: Optional[Union[TextItem:
+        :param RefItem]]:  (Default value = None)
+        :param # This is not cool yet.prov: Optional[ProvenanceItem]
+        :param parent: Optional[GroupItem]:  (Default value = None)
+
+        """
+        if not parent:
+            parent = self.body
+
+        table_index = len(self.tables)
+        cref = f"#/tables/{table_index}"
+
+        tbl_item = TableItem(
+            label=DocItemLabel.TABLE, data=data, self_ref=cref, parent=parent.get_ref()
+        )
+        if prov:
+            tbl_item.prov.append(prov)
+        if caption:
+            tbl_item.captions.append(caption.get_ref())
+
+        self.tables.append(tbl_item)
+        parent.children.append(RefItem(cref=cref))
+
+        return tbl_item
+
+    def add_picture(
+        self,
+        annotations: List[PictureDataType] = [],
+        image: Optional[ImageRef] = None,
+        caption: Optional[Union[TextItem, RefItem]] = None,
+        prov: Optional[ProvenanceItem] = None,
+        parent: Optional[GroupItem] = None,
+    ):
+        """add_picture.
+
+        :param data: List[PictureData]:  (Default value = [])
+        :param caption: Optional[Union[TextItem:
+        :param RefItem]]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param parent: Optional[GroupItem]:  (Default value = None)
+
+        """
+        if not parent:
+            parent = self.body
+
+        picture_index = len(self.pictures)
+        cref = f"#/pictures/{picture_index}"
+
+        fig_item = PictureItem(
+            label=DocItemLabel.PICTURE,
+            annotations=annotations,
+            image=image,
+            self_ref=cref,
+            parent=parent.get_ref(),
+        )
+        if prov:
+            fig_item.prov.append(prov)
+        if caption:
+            fig_item.captions.append(caption.get_ref())
+
+        self.pictures.append(fig_item)
+        parent.children.append(RefItem(cref=cref))
+
+        return fig_item
+
+    def add_heading(
+        self,
+        text: str,
+        orig: Optional[str] = None,
+        level: LevelNumber = 1,
+        prov: Optional[ProvenanceItem] = None,
+        parent: Optional[GroupItem] = None,
+    ):
+        """add_heading.
+
+        :param label: DocItemLabel:
+        :param text: str:
+        :param orig: Optional[str]:  (Default value = None)
+        :param level: LevelNumber:  (Default value = 1)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param parent: Optional[GroupItem]:  (Default value = None)
+
+        """
+        if not parent:
+            parent = self.body
+
+        if not orig:
+            orig = text
+
+        text_index = len(self.texts)
+        cref = f"#/texts/{text_index}"
+        section_header_item = SectionHeaderItem(
+            level=level,
+            text=text,
+            orig=orig,
+            self_ref=cref,
+            parent=parent.get_ref(),
+        )
+        if prov:
+            section_header_item.prov.append(prov)
+
+        self.texts.append(section_header_item)
+        parent.children.append(RefItem(cref=cref))
+
+        return section_header_item
+
+    def num_pages(self):
+        """num_pages."""
+        return len(self.pages.values())
+
+    def validate_tree(self, root) -> bool:
+        """validate_tree."""
+        res = []
+        for child_ref in root.children:
+            child = child_ref.resolve(self)
+            if child.parent.resolve(self) != root:
+                return False
+            res.append(self.validate_tree(child))
+
+        return all(res) or len(res) == 0
+
+    def iterate_items(
+        self,
+        root: Optional[NodeItem] = None,
+        with_groups: bool = False,
+        traverse_pictures: bool = True,
+        page_no: Optional[int] = None,
+        _level: int = 0,  # fixed parameter, carries through the node nesting level
+    ) -> typing.Iterable[Tuple[NodeItem, int]]:  # tuple of node and level
+        """iterate_elements.
+
+        :param root: Optional[NodeItem]:  (Default value = None)
+        :param with_groups: bool:  (Default value = False)
+        :param traverse_pictures: bool:  (Default value = True)
+        :param page_no: Optional[int]:  (Default value = None)
+        :param _level:  (Default value = 0)
+        :param # fixed parameter:
+        :param carries through the node nesting level:
+        """
+        if not root:
+            root = self.body
+
+        if not isinstance(root, GroupItem) or with_groups:
+            if isinstance(root, DocItem):
+                if page_no is not None:
+                    for prov in root.prov:
+                        if prov.page_no == page_no:
+                            yield root, _level
+                else:
+                    yield root, _level
+            else:
+                yield root, _level
+
+        # Traverse children
+        for child_ref in root.children:
+            child = child_ref.resolve(self)
+
+            if isinstance(child, NodeItem):
+                # If the child is a NodeItem, recursively traverse it
+                if not isinstance(child, PictureItem) or traverse_pictures:
+                    yield from self.iterate_items(
+                        child, _level=_level + 1, with_groups=with_groups
+                    )
+
+    def print_element_tree(self):
+        """print_element_tree."""
+        for ix, (item, level) in enumerate(self.iterate_items(with_groups=True)):
+            if isinstance(item, GroupItem):
+                print(" " * level, f"{ix}: {item.label.value} with name={item.name}")
+            elif isinstance(item, DocItem):
+                print(" " * level, f"{ix}: {item.label.value}")
+
+    def export_to_dict(self) -> Dict:
+        """export_to_dict."""
+        return self.model_dump(mode="json", by_alias=True, exclude_none=True)

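A minimal sketch of the document-construction API defined above; names and the import path are taken from this diff, everything else (the sample content) is made up for illustration:

```python
from docling_core.types.doc.document import DoclingDocument, TableCell, TableData
from docling_core.types.doc.labels import DocItemLabel

doc = DoclingDocument(name="Untitled 1")
doc.add_heading(text="Introduction", level=1)
doc.add_text(label=DocItemLabel.PARAGRAPH, text="Docling documents are tree-structured.")
doc.add_list_item(text="first bullet")

cell = TableCell(text="A", start_row_offset_idx=0, end_row_offset_idx=1,
                 start_col_offset_idx=0, end_col_offset_idx=1)
doc.add_table(data=TableData(table_cells=[cell], num_rows=1, num_cols=1))

print(doc.export_to_markdown())
print(doc.export_to_dict()["schema_name"])  # "DoclingDocument"
```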
def export_to_markdown( # noqa: C901
|
|
438
1110
|
self,
|
|
439
1111
|
delim: str = "\n\n",
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
"title",
|
|
444
|
-
"subtitle-level-1",
|
|
445
|
-
"paragraph",
|
|
446
|
-
"caption",
|
|
447
|
-
"table",
|
|
448
|
-
"figure",
|
|
449
|
-
],
|
|
1112
|
+
from_element: int = 0,
|
|
1113
|
+
to_element: Optional[int] = None,
|
|
1114
|
+
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
|
|
450
1115
|
strict_text: bool = False,
|
|
451
1116
|
image_placeholder: str = "<!-- image -->",
|
|
452
1117
|
) -> str:
|
|
@@ -455,66 +1120,77 @@ class ExportedCCSDocument(
         Operates on a slice of the document's main_text as defined through arguments
         main_text_start and main_text_stop; defaulting to the whole main_text.

-
-            delim (str, optional): Delimiter to use when concatenating the various
+        :param delim: Delimiter to use when concatenating the various
             Markdown parts. Defaults to "\n\n".
-
+        :type delim: str
+        :param from_element: Body slicing start index (inclusive).
             Defaults to 0.
-
+        :type from_element: int
+        :param to_element: Body slicing stop index
             (exclusive). Defaults to None.
-
-
-
-
-
-
-
-
-
+        :type to_element: Optional[int]
+        :param delim: str: (Default value = "\n\n")
+        :param from_element: int: (Default value = 0)
+        :param to_element: Optional[int]: (Default value = None)
+        :param labels: set[DocItemLabel]
+        :param "subtitle-level-1":
+        :param "paragraph":
+        :param "caption":
+        :param "table":
+        :param "Text":
+        :param "text":
+        :param ]:
+        :param strict_text: bool: (Default value = False)
+        :param image_placeholder str: (Default value = "<!-- image -->")
+            the placeholder to include to position images in the markdown.
+        :returns: The exported Markdown representation.
+        :rtype: str
         """
         has_title = False
         prev_text = ""
         md_texts: list[str] = []

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # collect all captions embedded in table and figure objects
+        # to avoid repeating them
+        embedded_captions = set()
+        skip_count = 0
+        for ix, (item, level) in enumerate(self.iterate_items(self.body)):
+            if skip_count < from_element:
+                skip_count += 1
+                continue  # skip as many items as you want
+
+            if to_element and ix >= to_element:
+                break
+
+            if (
+                isinstance(item, (TableItem, PictureItem))
+                and len(item.captions) > 0
+                and item.label in labels
+            ):
+                caption = item.caption_text(self)
+                if caption:
+                    embedded_captions.add(caption)
+
+        skip_count = 0
+        for ix, (item, level) in enumerate(self.iterate_items(self.body)):
+            if skip_count < from_element:
+                skip_count += 1
+                continue  # skip as many items as you want
+
+            if to_element and ix >= to_element:
+                break
+
+            markdown_text = ""
+
+            if isinstance(item, DocItem):
+                item_type = item.label
+
+                if isinstance(item, TextItem) and item_type in labels:
                     text = item.text

                     # skip captions of they are embedded in the actual
                     # floating object
-                    if item_type ==
+                    if item_type == DocItemLabel.CAPTION and text in embedded_captions:
                         continue

                     # ignore repeated text
@@ -524,7 +1200,7 @@ class ExportedCCSDocument(
                     prev_text = text

                     # first title match
-                    if item_type ==
+                    if item_type == DocItemLabel.TITLE and not has_title:
                         if strict_text:
                             markdown_text = f"{text}"
                         else:
@@ -532,78 +1208,89 @@ class ExportedCCSDocument(
                             has_title = True

                     # secondary titles
-                    elif item_type in {
-
-
+                    elif item_type in {
+                        DocItemLabel.TITLE,
+                        DocItemLabel.SECTION_HEADER,
+                    } or (has_title and item_type == DocItemLabel.TITLE):
                         if strict_text:
                             markdown_text = f"{text}"
                         else:
                             markdown_text = f"## {text}"

+                    # secondary titles
+                    elif isinstance(item, ListItem):
+                        if item.enumerated:
+                            marker = item.marker
+                        else:
+                            marker = "-"
+
+                        markdown_text = f"{marker} {text}"
+
                     # normal text
                     else:
                         markdown_text = text

-                elif (
-
-                    and item.data
-                    and item_type in main_text_labels
-                ):
+                elif isinstance(item, TableItem) and item.data and item_type in labels:
+                    parts = []

-
-
-
-
-
-
-                        table.append(tmp)
-
-                    if len(table) > 1 and len(table[0]) > 0:
-                        try:
-                            md_table = tabulate(
-                                table[1:], headers=table[0], tablefmt="github"
-                            )
-                        except ValueError:
-                            md_table = tabulate(
-                                table[1:],
-                                headers=table[0],
-                                tablefmt="github",
-                                disable_numparse=True,
-                            )
-
-                    markdown_text = ""
-                    if item.text:
-                        markdown_text = item.text
+                    # Compute the caption
+                    if caption := item.caption_text(self):
+                        parts.append(caption)
+                        parts.append("\n")
+
+                    # Rendered the item
                     if not strict_text:
-
+                        md_table = item.export_to_markdown()
+                        if md_table:
+                            parts.append(item.export_to_markdown())
+
+                    # Combine parts
+                    markdown_text = "\n".join(parts)

-                elif isinstance(item,
+                elif isinstance(item, PictureItem) and item_type in labels:
+                    parts = []

-
-                    if item.
-
+                    # Compute the caption
+                    if caption := item.caption_text(self):
+                        parts.append(caption)
+                        parts.append("\n")
+
+                    # Rendered the item
                     if not strict_text:
-
+                        parts.append(f"{image_placeholder}")
+
+                    # Combine parts
+                    markdown_text = "\n".join(parts)

-
-
+            if markdown_text:
+                md_texts.append(markdown_text)

         result = delim.join(md_texts)
         return result

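The new implementation builds its output from per-item calls (caption_text and the item-level export_to_markdown), so the same building blocks can be used directly. A small sketch, assuming a DoclingDocument instance as above; the table_summaries helper is hypothetical.

```python
from docling_core.types.doc.document import DoclingDocument, TableItem


def table_summaries(doc: DoclingDocument) -> list[str]:
    """Collect caption plus markdown rendering for every table in the body."""
    summaries = []
    for item, _level in doc.iterate_items(doc.body):
        if isinstance(item, TableItem):
            caption = item.caption_text(doc)  # resolved caption text, may be empty
            summaries.append(f"{caption}\n{item.export_to_markdown()}")
    return summaries
```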
+    def export_to_text(  # noqa: C901
+        self,
+        delim: str = "\n\n",
+        from_element: int = 0,
+        to_element: Optional[int] = None,
+        labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
+    ) -> str:
+        """export_to_text."""
+        return self.export_to_markdown(
+            delim,
+            from_element,
+            to_element,
+            labels,
+            strict_text=True,
+            image_placeholder="",
+        )
+
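Since export_to_text only forwards to export_to_markdown with strict_text=True and an empty image placeholder, a plain-text dump needs no extra setup. Sketch, with the same assumed doc instance:

```python
# plain-text rendering: heading markup, markdown tables and image placeholders suppressed
txt = doc.export_to_text(delim="\n\n", from_element=0, to_element=None)
print(txt)
```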
     def export_to_document_tokens(
         self,
         delim: str = "\n\n",
-
-
-
-            "title",
-            "subtitle-level-1",
-            "paragraph",
-            "caption",
-            "table",
-            "figure",
-        ],
+        from_element: int = 0,
+        to_element: Optional[int] = None,
+        labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
         xsize: int = 100,
         ysize: int = 100,
         add_location: bool = True,
@@ -616,11 +1303,23 @@ class ExportedCCSDocument(
     ) -> str:
         r"""Exports the document content to an DocumentToken format.

-        Operates on a slice of the document's
-
-
-
-
+        Operates on a slice of the document's body as defined through arguments
+        from_element and to_element; defaulting to the whole main_text.
+
+        :param delim: str: (Default value = "\n\n")
+        :param from_element: int: (Default value = 0)
+        :param to_element: Optional[int]: (Default value = None)
+        :param labels: set[DocItemLabel]
+        :param xsize: int: (Default value = 100)
+        :param ysize: int: (Default value = 100)
+        :param add_location: bool: (Default value = True)
+        :param add_content: bool: (Default value = True)
+        :param add_page_index: bool: (Default value = True)
+        :param # table specific flagsadd_table_cell_location: bool
+        :param add_table_cell_label: bool: (Default value = True)
+        :param add_table_cell_text: bool: (Default value = True)
+        :returns: The content of the document formatted as a DocTags string.
+        :rtype: str
         """
         new_line = ""
         if delim:
@@ -630,82 +1329,113 @@ class ExportedCCSDocument(

         # pagedims = self.get_map_to_page_dimensions()

-
-
+        skip_count = 0
+        for ix, (item, level) in enumerate(self.iterate_items(self.body)):
+            if skip_count < from_element:
+                skip_count += 1
+                continue  # skip as many items as you want

-
-
-                if isinstance(orig_item, Ref)
-                else orig_item
-            )
+            if to_element and ix >= to_element:
+                break

-
-
+            if not isinstance(item, DocItem):
+                continue

-
+            prov = item.prov

-
-            page_w = 0.0
-            page_h = 0.0
+            page_i = -1

-
-                add_location
-                and self.page_dimensions is not None
-                and prov is not None
-                and len(prov) > 0
-            ):
+            if add_location and len(self.pages) and len(prov) > 0:

-
-
+                page_i = prov[0].page_no
+                page_dim = self.pages[page_i].size

-
-
+                float(page_dim.width)
+                float(page_dim.height)

-
-
+            item_type = item.label
+            if isinstance(item, TextItem) and (item_type in labels):

-
-
-
-
-
-
-
-
-
-            )
+                doctags += item.export_to_document_tokens(
+                    doc=self,
+                    new_line=new_line,
+                    xsize=xsize,
+                    ysize=ysize,
+                    add_location=add_location,
+                    add_content=add_content,
+                    add_page_index=add_page_index,
+                )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
+            elif isinstance(item, TableItem) and (item_type in labels):
+
+                doctags += item.export_to_document_tokens(
+                    doc=self,
+                    new_line=new_line,
+                    xsize=xsize,
+                    ysize=ysize,
+                    add_caption=True,
+                    add_location=add_location,
+                    add_content=add_content,
+                    add_cell_location=add_table_cell_location,
+                    add_cell_label=add_table_cell_label,
+                    add_cell_text=add_table_cell_text,
+                    add_page_index=add_page_index,
+                )

-
-
-
-
-
-
-
-
-
-
-
-
-            )
+            elif isinstance(item, PictureItem) and (item_type in labels):
+
+                doctags += item.export_to_document_tokens(
+                    doc=self,
+                    new_line=new_line,
+                    xsize=xsize,
+                    ysize=ysize,
+                    add_caption=True,
+                    add_location=add_location,
+                    add_content=add_content,
+                    add_page_index=add_page_index,
+                )

         doctags += DocumentToken.END_DOCUMENT.value

         return doctags
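A usage sketch for the DocTags export with the new slicing arguments; the values simply restate the defaults from the signature above, and doc is the same assumed instance:

```python
doctags = doc.export_to_document_tokens(
    from_element=0,
    to_element=None,
    xsize=100,  # grid used when scaling location tags, per the defaults above
    ysize=100,
    add_location=True,
    add_content=True,
    add_page_index=True,
)
print(doctags)
```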
+
+    def add_page(
+        self, page_no: int, size: Size, image: Optional[ImageRef] = None
+    ) -> PageItem:
+        """add_page.
+
+        :param page_no: int:
+        :param size: Size:
+
+        """
+        pitem = PageItem(page_no=page_no, size=size, image=image)
+
+        self.pages[page_no] = pitem
+        return pitem
+
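add_page registers a PageItem in self.pages keyed by page number and returns it. A sketch, assuming Size is the width/height model from docling_core.types.doc.base and that the A4-style point dimensions are an arbitrary example:

```python
from docling_core.types.doc.base import Size  # assumed import path for Size

# register page 1 with A4-like dimensions in points (illustrative values)
page = doc.add_page(page_no=1, size=Size(width=595.0, height=842.0))
assert doc.pages[1] is page
```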
+    @field_validator("version")
+    @classmethod
+    def check_version_is_compatible(cls, v: str) -> str:
+        """Check if this document version is compatible with current version."""
+        current_match = re.match(VERSION_PATTERN, CURRENT_VERSION)
+        doc_match = re.match(VERSION_PATTERN, v)
+        if (
+            doc_match is None
+            or current_match is None
+            or doc_match["major"] != current_match["major"]
+            or doc_match["minor"] > current_match["minor"]
+        ):
+            raise ValueError(
+                f"incompatible version {v} with schema version {CURRENT_VERSION}"
+            )
+        else:
+            return CURRENT_VERSION
+
+    @model_validator(mode="after")  # type: ignore
+    @classmethod
+    def validate_document(cls, d: "DoclingDocument"):
+        """validate_document."""
+        if not d.validate_tree(d.body) or not d.validate_tree(d.furniture):
+            raise ValueError("Document hierachy is inconsistent.")
+
+        return d