docling-core 1.7.1__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of docling-core might be problematic.
- docling_core/transforms/chunker/__init__.py +2 -8
- docling_core/transforms/chunker/base.py +27 -40
- docling_core/transforms/chunker/hierarchical_chunker.py +144 -312
- docling_core/types/__init__.py +12 -8
- docling_core/types/doc/__init__.py +25 -0
- docling_core/types/doc/base.py +136 -451
- docling_core/types/doc/document.py +1288 -559
- docling_core/types/{experimental → doc}/labels.py +4 -1
- docling_core/types/legacy_doc/__init__.py +6 -0
- docling_core/types/legacy_doc/base.py +485 -0
- docling_core/types/{doc → legacy_doc}/doc_ann.py +1 -1
- docling_core/types/{doc → legacy_doc}/doc_ocr.py +1 -1
- docling_core/types/{doc → legacy_doc}/doc_raw.py +1 -1
- docling_core/types/legacy_doc/document.py +715 -0
- docling_core/types/rec/subject.py +1 -1
- docling_core/utils/generate_docs.py +82 -0
- docling_core/utils/{ds_generate_jsonschema.py → generate_jsonschema.py} +4 -4
- docling_core/utils/validators.py +3 -3
- {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/METADATA +11 -11
- {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/RECORD +24 -31
- docling_core-2.0.0.dist-info/entry_points.txt +5 -0
- docling_core/transforms/id_generator/__init__.py +0 -12
- docling_core/transforms/id_generator/base.py +0 -30
- docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -27
- docling_core/transforms/id_generator/uuid_generator.py +0 -34
- docling_core/transforms/metadata_extractor/__init__.py +0 -13
- docling_core/transforms/metadata_extractor/base.py +0 -59
- docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +0 -59
- docling_core/types/experimental/__init__.py +0 -30
- docling_core/types/experimental/base.py +0 -167
- docling_core/types/experimental/document.py +0 -1192
- docling_core/utils/ds_generate_docs.py +0 -144
- docling_core-1.7.1.dist-info/entry_points.txt +0 -5
- /docling_core/types/{doc → legacy_doc}/tokens.py +0 -0
- {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/LICENSE +0 -0
- {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/WHEEL +0 -0
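
The headline change in this list is the rewrite of docling_core/types/doc/document.py around a new DoclingDocument model, while the legacy CCS models move under docling_core/types/legacy_doc. As a rough orientation before the diff below, here is a minimal sketch of how a document might be assembled and exported with the 2.0.0 API; the import paths and call signatures are read off the diff of document.py rather than taken from documentation, so treat them as assumptions.

from docling_core.types.doc.document import DoclingDocument, TableCell, TableData
from docling_core.types.doc.labels import DocItemLabel

# Build a document tree: headings, paragraphs, list items and tables are
# appended to doc.body and linked through JSON-pointer style references.
doc = DoclingDocument(name="sample")
doc.add_heading(text="Overview", level=1)
doc.add_text(label=DocItemLabel.PARAGRAPH, text="docling-core 2.0.0 introduces DoclingDocument.")
doc.add_list_item(text="hierarchical body/furniture trees")
doc.add_list_item(text="typed items for text, tables and pictures")

# Tables are described by a sparse list of TableCell spans plus grid dimensions.
cells = [
    TableCell(text="name", column_header=True,
              start_row_offset_idx=0, end_row_offset_idx=1,
              start_col_offset_idx=0, end_col_offset_idx=1),
    TableCell(text="docling-core",
              start_row_offset_idx=1, end_row_offset_idx=2,
              start_col_offset_idx=0, end_col_offset_idx=1),
]
doc.add_table(data=TableData(num_rows=2, num_cols=1, table_cells=cells))

print(doc.export_to_markdown())
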
docling_core/types/doc/document.py
@@ -1,452 +1,1116 @@
-#
-# Copyright IBM Corp. 2024 - 2024
-# SPDX-License-Identifier: MIT
-#
-
 """Models for the Docling Document data type."""

-
-
+import base64
+import mimetypes
+import re
+import typing
+from io import BytesIO
+from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union

+import pandas as pd
+from PIL import Image as PILImage
 from pydantic import (
-
+    AnyUrl,
     BaseModel,
+    ConfigDict,
     Field,
-
-
+    StringConstraints,
+    computed_field,
+    field_validator,
     model_validator,
 )
 from tabulate import tabulate
+from typing_extensions import Annotated, Self

-from docling_core.search.
-from docling_core.types.base import
-
-
-
-    DescriptionAdvancedT,
-    DescriptionAnalyticsT,
-    FileInfoObject,
-    Identifier,
-    IdentifierTypeT,
-    LanguageT,
-    Log,
-)
-from docling_core.types.doc.base import (
-    BaseCell,
-    BaseText,
-    BitmapObject,
-    Figure,
-    PageDimensions,
-    PageReference,
-    Ref,
-    S3Data,
-    Table,
-)
-from docling_core.types.doc.tokens import DocumentToken
-from docling_core.utils.alias import AliasModel
+from docling_core.search.package import VERSION_PATTERN
+from docling_core.types.base import _JSON_POINTER_REGEX
+from docling_core.types.doc import BoundingBox, Size
+from docling_core.types.doc.labels import DocItemLabel, GroupLabel
+from docling_core.types.legacy_doc.tokens import DocumentToken

+Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
+LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
+CURRENT_VERSION: Final = "1.0.0"

-
-
+DEFAULT_EXPORT_LABELS = {
+    DocItemLabel.TITLE,
+    DocItemLabel.DOCUMENT_INDEX,
+    DocItemLabel.SECTION_HEADER,
+    DocItemLabel.PARAGRAPH,
+    DocItemLabel.CAPTION,
+    DocItemLabel.TABLE,
+    DocItemLabel.PICTURE,
+    DocItemLabel.FORMULA,
+    DocItemLabel.CHECKBOX_UNSELECTED,
+    DocItemLabel.CHECKBOX_SELECTED,
+    DocItemLabel.TEXT,
+    DocItemLabel.LIST_ITEM,
+    DocItemLabel.CODE,
+}

52
|
-
author: Optional[list[StrictStr]] = None
|
|
53
|
-
keywords: Optional[str] = None
|
|
54
|
-
subject: Optional[str] = None
|
|
55
|
-
title: Optional[StrictStr] = None
|
|
56
|
-
creation_date: Optional[str] = None # datetime
|
|
57
51
|
|
|
52
|
+
class BasePictureData(BaseModel):
|
|
53
|
+
"""BasePictureData."""
|
|
58
54
|
|
|
59
|
-
|
|
60
|
-
"""File info object."""
|
|
55
|
+
kind: str
|
|
61
56
|
|
|
62
|
-
num_pages: Optional[int] = Field(default=None, alias="#-pages")
|
|
63
57
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
alias="collection-name",
|
|
67
|
-
json_schema_extra=es_field(type="keyword", ignore_above=8191),
|
|
68
|
-
)
|
|
69
|
-
description: Optional[CCSFileInfoDescription] = Field(
|
|
70
|
-
default=None, json_schema_extra=es_field(suppress=True)
|
|
71
|
-
)
|
|
72
|
-
page_hashes: Optional[list[PageReference]] = Field(
|
|
73
|
-
default=None, alias="page-hashes"
|
|
74
|
-
)
|
|
58
|
+
class PictureClassificationClass(BaseModel):
|
|
59
|
+
"""PictureClassificationData."""
|
|
75
60
|
|
|
61
|
+
class_name: str
|
|
62
|
+
confidence: float
|
|
76
63
|
|
|
77
|
-
class Affiliation(BaseModel, extra="forbid"):
|
|
78
|
-
"""Affiliation."""
|
|
79
|
-
|
|
80
|
-
name: str = Field(
|
|
81
|
-
...,
|
|
82
|
-
json_schema_extra=es_field(
|
|
83
|
-
fields={
|
|
84
|
-
"lower": {
|
|
85
|
-
"normalizer": "lowercase_asciifolding",
|
|
86
|
-
"type": "keyword",
|
|
87
|
-
"ignore_above": 8191,
|
|
88
|
-
},
|
|
89
|
-
"keyword": {"type": "keyword", "ignore_above": 8191},
|
|
90
|
-
},
|
|
91
|
-
),
|
|
92
|
-
)
|
|
93
|
-
id: Optional[str] = Field(
|
|
94
|
-
default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
95
|
-
)
|
|
96
|
-
source: Optional[str] = Field(
|
|
97
|
-
default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
98
|
-
)
|
|
99
64
|
|
|
65
|
+
class PictureClassificationData(BasePictureData):
|
|
66
|
+
"""PictureClassificationData."""
|
|
100
67
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
name: str = Field(
|
|
105
|
-
...,
|
|
106
|
-
json_schema_extra=es_field(
|
|
107
|
-
type="text",
|
|
108
|
-
fields={
|
|
109
|
-
"lower": {
|
|
110
|
-
"normalizer": "lowercase_asciifolding",
|
|
111
|
-
"type": "keyword",
|
|
112
|
-
"ignore_above": 8191,
|
|
113
|
-
},
|
|
114
|
-
"keyword": {"type": "keyword", "ignore_above": 8191},
|
|
115
|
-
},
|
|
116
|
-
),
|
|
117
|
-
)
|
|
118
|
-
id: Optional[str] = Field(
|
|
119
|
-
default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
120
|
-
)
|
|
121
|
-
source: Optional[str] = Field(
|
|
122
|
-
default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
123
|
-
)
|
|
124
|
-
affiliations: Optional[list[Affiliation]] = None
|
|
68
|
+
kind: Literal["classification"] = "classification"
|
|
69
|
+
provenance: str
|
|
70
|
+
predicted_classes: List[PictureClassificationClass]
|
|
125
71
|
|
|
126
72
|
|
|
127
|
-
class
|
|
128
|
-
"""
|
|
73
|
+
class PictureDescriptionData(BasePictureData):
|
|
74
|
+
"""PictureDescriptionData."""
|
|
129
75
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
)
|
|
134
|
-
name: StrictStr = Field(
|
|
135
|
-
json_schema_extra=es_field(type="keyword", ignore_above=8191),
|
|
136
|
-
description="Name of the publication.",
|
|
137
|
-
)
|
|
138
|
-
alternate_names: Optional[list[StrictStr]] = Field(
|
|
139
|
-
default=None,
|
|
140
|
-
json_schema_extra=es_field(type="text"),
|
|
141
|
-
title="Alternate Names",
|
|
142
|
-
description="Other names or abbreviations of this publication.",
|
|
143
|
-
)
|
|
144
|
-
type: Optional[list[StrictStr]] = Field(
|
|
145
|
-
default=None,
|
|
146
|
-
json_schema_extra=es_field(type="keyword", ignore_above=8191),
|
|
147
|
-
description="Type of publication (journal article, conference, review,...).",
|
|
148
|
-
)
|
|
149
|
-
pages: Optional[StrictStr] = Field(
|
|
150
|
-
default=None,
|
|
151
|
-
json_schema_extra=es_field(type="text"),
|
|
152
|
-
description="Page range in the publication.",
|
|
153
|
-
)
|
|
154
|
-
issue: Optional[StrictStr] = Field(
|
|
155
|
-
default=None,
|
|
156
|
-
json_schema_extra=es_field(type="keyword", ignore_above=8191),
|
|
157
|
-
description="Publication issue (issue number).",
|
|
158
|
-
)
|
|
159
|
-
volume: Optional[StrictStr] = Field(
|
|
160
|
-
default=None,
|
|
161
|
-
json_schema_extra=es_field(type="keyword", ignore_above=8191),
|
|
162
|
-
description="Publication volume.",
|
|
163
|
-
)
|
|
164
|
-
url: Optional[AnyHttpUrl] = Field(
|
|
165
|
-
default=None,
|
|
166
|
-
json_schema_extra=es_field(type="keyword", ignore_above=8191),
|
|
167
|
-
description="URL on the publication site.",
|
|
168
|
-
)
|
|
76
|
+
kind: Literal["description"] = "description"
|
|
77
|
+
text: str
|
|
78
|
+
provenance: str
|
|
169
79
|
|
|
170
80
|
|
|
171
|
-
class
|
|
172
|
-
"""
|
|
81
|
+
class PictureMoleculeData(BaseModel):
|
|
82
|
+
"""PictureMoleculeData."""
|
|
173
83
|
|
|
174
|
-
|
|
175
|
-
default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
176
|
-
)
|
|
177
|
-
text: Optional[StrictStr] = None
|
|
84
|
+
kind: Literal["molecule_data"] = "molecule_data"
|
|
178
85
|
|
|
86
|
+
smi: str
|
|
87
|
+
confidence: float
|
|
88
|
+
class_name: str
|
|
89
|
+
segmentation: List[Tuple[float, float]]
|
|
90
|
+
provenance: str
|
|
179
91
|
|
|
180
|
-
class CCSDocumentDescription(
|
|
181
|
-
AliasModel,
|
|
182
|
-
Generic[
|
|
183
|
-
DescriptionAdvancedT,
|
|
184
|
-
DescriptionAnalyticsT,
|
|
185
|
-
IdentifierTypeT,
|
|
186
|
-
LanguageT,
|
|
187
|
-
CollectionNameTypeT,
|
|
188
|
-
],
|
|
189
|
-
):
|
|
190
|
-
"""Description in document."""
|
|
191
|
-
|
|
192
|
-
title: Optional[StrictStr] = None
|
|
193
|
-
abstract: Optional[list[StrictStr]] = None
|
|
194
|
-
authors: Optional[list[Author]] = None
|
|
195
|
-
affiliations: Optional[list[Affiliation]] = None
|
|
196
|
-
subjects: Optional[list[str]] = Field(
|
|
197
|
-
default=None,
|
|
198
|
-
json_schema_extra=es_field(
|
|
199
|
-
fields={"keyword": {"ignore_above": 8191, "type": "keyword"}}
|
|
200
|
-
),
|
|
201
|
-
)
|
|
202
|
-
keywords: Optional[list[str]] = Field(
|
|
203
|
-
default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
204
|
-
)
|
|
205
|
-
publication_date: Optional[datetime] = None
|
|
206
|
-
languages: Optional[list[LanguageT]] = Field(
|
|
207
|
-
default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
208
|
-
)
|
|
209
|
-
license_: Optional[DescriptionLicense] = Field(default=None, alias="license")
|
|
210
|
-
publishers: Optional[list[StrictStr]] = Field(
|
|
211
|
-
default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
212
|
-
)
|
|
213
|
-
url_refs: Optional[list[str]] = Field(
|
|
214
|
-
default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
215
|
-
)
|
|
216
|
-
references: Optional[list[Identifier[IdentifierTypeT]]] = None
|
|
217
|
-
publication: Optional[list[Publication]] = Field(
|
|
218
|
-
default=None, description="List of publication journals or venues."
|
|
219
|
-
)
|
|
220
|
-
reference_count: Optional[NonNegativeInt] = Field(
|
|
221
|
-
default=None,
|
|
222
|
-
title="Reference Count",
|
|
223
|
-
description="Total number of documents referenced by this document.",
|
|
224
|
-
json_schema_extra=es_field(type="integer"),
|
|
225
|
-
)
|
|
226
|
-
citation_count: Optional[NonNegativeInt] = Field(
|
|
227
|
-
default=None,
|
|
228
|
-
title="Citation Count",
|
|
229
|
-
description=(
|
|
230
|
-
"Total number of citations that this document has received (number "
|
|
231
|
-
"of documents in whose bibliography this document appears)."
|
|
232
|
-
),
|
|
233
|
-
json_schema_extra=es_field(type="integer"),
|
|
234
|
-
)
|
|
235
|
-
citation_date: Optional[datetime] = Field(
|
|
236
|
-
default=None,
|
|
237
|
-
title="Citation Count Date",
|
|
238
|
-
description="Last update date of the citation count.",
|
|
239
|
-
)
|
|
240
|
-
advanced: Optional[DescriptionAdvancedT] = None
|
|
241
|
-
analytics: Optional[DescriptionAnalyticsT] = None
|
|
242
|
-
logs: list[Log]
|
|
243
|
-
collection: Optional[CollectionDocumentInfo[CollectionNameTypeT]] = Field(
|
|
244
|
-
default=None, description="The collection information of this document."
|
|
245
|
-
)
|
|
246
|
-
acquisition: Optional[Acquisition] = Field(
|
|
247
|
-
default=None,
|
|
248
|
-
description=(
|
|
249
|
-
"Information on how the document was obtained, for data governance"
|
|
250
|
-
" purposes."
|
|
251
|
-
),
|
|
252
|
-
)
|
|
253
92
|
|
|
93
|
+
class PictureMiscData(BaseModel):
|
|
94
|
+
"""PictureMiscData."""
|
|
254
95
|
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
"""Minimal model for a document."""
|
|
266
|
-
|
|
267
|
-
name: StrictStr = Field(alias="_name")
|
|
268
|
-
obj_type: Optional[StrictStr] = Field("document", alias="type")
|
|
269
|
-
description: CCSDocumentDescription[
|
|
270
|
-
DescriptionAdvancedT,
|
|
271
|
-
DescriptionAnalyticsT,
|
|
272
|
-
IdentifierTypeT,
|
|
273
|
-
LanguageT,
|
|
274
|
-
CollectionNameTypeT,
|
|
275
|
-
]
|
|
276
|
-
file_info: FileInfoObject = Field(alias="file-info")
|
|
277
|
-
main_text: Optional[list[Union[Ref, BaseText]]] = Field(
|
|
278
|
-
default=None, alias="main-text"
|
|
279
|
-
)
|
|
280
|
-
figures: Optional[list[Figure]] = None
|
|
281
|
-
tables: Optional[list[Table]] = None
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
class CCSDocument(
|
|
285
|
-
MinimalDocument,
|
|
286
|
-
Generic[
|
|
287
|
-
DescriptionAdvancedT,
|
|
288
|
-
DescriptionAnalyticsT,
|
|
289
|
-
IdentifierTypeT,
|
|
290
|
-
LanguageT,
|
|
291
|
-
CollectionNameTypeT,
|
|
96
|
+
kind: Literal["misc"] = "misc"
|
|
97
|
+
content: Dict[str, Any]
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
PictureDataType = Annotated[
|
|
101
|
+
Union[
|
|
102
|
+
PictureClassificationData,
|
|
103
|
+
PictureDescriptionData,
|
|
104
|
+
PictureMoleculeData,
|
|
105
|
+
PictureMiscData,
|
|
292
106
|
],
|
|
293
|
-
)
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
107
|
+
Field(discriminator="kind"),
|
|
108
|
+
]
|
|
109
|
+
|
|
110
|
+
|
|
+class TableCell(BaseModel):
+    """TableCell."""
+
+    bbox: Optional[BoundingBox] = None
+    row_span: int = 1
+    col_span: int = 1
+    start_row_offset_idx: int
+    end_row_offset_idx: int
+    start_col_offset_idx: int
+    end_col_offset_idx: int
+    text: str
+    column_header: bool = False
+    row_header: bool = False
+    row_section: bool = False

     @model_validator(mode="before")
     @classmethod
-    def
-    """
-    if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def from_dict_format(cls, data: Any) -> Any:
+        """from_dict_format."""
+        if isinstance(data, Dict):
+            # Check if this is a native BoundingBox or a bbox from docling-ibm-models
+            if (
+                # "bbox" not in data
+                # or data["bbox"] is None
+                # or isinstance(data["bbox"], BoundingBox)
+                "text"
+                in data
+            ):
+                return data
+            text = data["bbox"].get("token", "")
+            if not len(text):
+                text_cells = data.pop("text_cell_bboxes", None)
+                if text_cells:
+                    for el in text_cells:
+                        text += el["token"] + " "
+
+                text = text.strip()
+            data["text"] = text

-
-            descr = data["description"].get(key)
-            if descr is not None and not isinstance(descr, list):
-                if isinstance(descr, dict):
-                    data["description"][key] = [descr]
-                else:
-                    data["description"].pop(key)
+        return data

-        if data.get("main-text"):
-            for item in data["main-text"]:
-                if ref := item.pop("__ref", None):
-                    item["$ref"] = ref

-
+class TableData(BaseModel):  # TBD
+    """BaseTableData."""

+    table_cells: List[TableCell] = []
+    num_rows: int = 0
+    num_cols: int = 0

-
-
-
-
-
-
-
-
-
-
-
+    @computed_field  # type: ignore
+    @property
+    def grid(
+        self,
+    ) -> List[List[TableCell]]:
+        """grid."""
+        # Initialise empty table data grid (only empty cells)
+        table_data = [
+            [
+                TableCell(
+                    text="",
+                    start_row_offset_idx=i,
+                    end_row_offset_idx=i + 1,
+                    start_col_offset_idx=j,
+                    end_col_offset_idx=j + 1,
+                )
+                for j in range(self.num_cols)
+            ]
+            for i in range(self.num_rows)
+        ]
+
+        # Overwrite cells in table data for which there is actual cell content.
+        for cell in self.table_cells:
+            for i in range(
+                min(cell.start_row_offset_idx, self.num_rows),
+                min(cell.end_row_offset_idx, self.num_rows),
+            ):
+                for j in range(
+                    min(cell.start_col_offset_idx, self.num_cols),
+                    min(cell.end_col_offset_idx, self.num_cols),
+                ):
+                    table_data[i][j] = cell
+
+        return table_data

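The grid property above densifies the sparse table_cells list into a full num_rows × num_cols matrix, repeating spanned cells and padding uncovered positions with empty cells; the TableItem export methods later in the diff consume this grid. A small sketch, using the same assumed import path as the earlier example:

from docling_core.types.doc.document import TableCell, TableData

data = TableData(
    num_rows=2,
    num_cols=2,
    table_cells=[
        TableCell(  # header cell spanning both columns
            text="Header", col_span=2, column_header=True,
            start_row_offset_idx=0, end_row_offset_idx=1,
            start_col_offset_idx=0, end_col_offset_idx=2,
        ),
        TableCell(
            text="a",
            start_row_offset_idx=1, end_row_offset_idx=2,
            start_col_offset_idx=0, end_col_offset_idx=1,
        ),
    ],
)

# The spanning header cell appears at both (0, 0) and (0, 1); the position
# (1, 1) that no cell covers stays an empty placeholder cell.
print([[c.text for c in row] for row in data.grid])
# [['Header', 'Header'], ['a', '']]
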
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
195
|
+
|
|
196
|
+
class DocumentOrigin(BaseModel):
|
|
197
|
+
"""FileSource."""
|
|
198
|
+
|
|
199
|
+
mimetype: str # the mimetype of the original file
|
|
200
|
+
binary_hash: Uint64 # the binary hash of the original file.
|
|
201
|
+
# TODO: Change to be Uint64 and provide utility method to generate
|
|
202
|
+
|
|
203
|
+
filename: str # The name of the original file, including extension, without path.
|
|
204
|
+
# Could stem from filesystem, source URI, Content-Disposition header, ...
|
|
205
|
+
|
|
206
|
+
uri: Optional[AnyUrl] = (
|
|
207
|
+
None # any possible reference to a source file,
|
|
208
|
+
# from any file handler protocol (e.g. https://, file://, s3://)
|
|
366
209
|
)
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
LanguageT,
|
|
375
|
-
CollectionNameTypeT,
|
|
210
|
+
|
|
211
|
+
_extra_mimetypes: typing.ClassVar[List[str]] = [
|
|
212
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
213
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
|
|
214
|
+
"application/vnd.openxmlformats-officedocument.presentationml.template",
|
|
215
|
+
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
|
|
216
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
376
217
|
]
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
)
|
|
381
|
-
|
|
382
|
-
|
|
218
|
+
|
|
219
|
+
@field_validator("binary_hash", mode="before")
|
|
220
|
+
@classmethod
|
|
221
|
+
def parse_hex_string(cls, value):
|
|
222
|
+
"""parse_hex_string."""
|
|
223
|
+
if isinstance(value, str):
|
|
224
|
+
try:
|
|
225
|
+
# Convert hex string to an integer
|
|
226
|
+
hash_int = Uint64(value, 16)
|
|
227
|
+
# Mask to fit within 64 bits (unsigned)
|
|
228
|
+
return (
|
|
229
|
+
hash_int & 0xFFFFFFFFFFFFFFFF
|
|
230
|
+
) # TODO be sure it doesn't clip uint64 max
|
|
231
|
+
except ValueError:
|
|
232
|
+
raise ValueError(f"Invalid sha256 hexdigest: {value}")
|
|
233
|
+
return value # If already an int, return it as is.
|
|
234
|
+
|
|
235
|
+
@field_validator("mimetype")
|
|
236
|
+
@classmethod
|
|
237
|
+
def validate_mimetype(cls, v):
|
|
238
|
+
"""validate_mimetype."""
|
|
239
|
+
# Check if the provided MIME type is valid using mimetypes module
|
|
240
|
+
if v not in mimetypes.types_map.values() and v not in cls._extra_mimetypes:
|
|
241
|
+
raise ValueError(f"'{v}' is not a valid MIME type")
|
|
242
|
+
return v
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
class RefItem(BaseModel):
|
|
246
|
+
"""RefItem."""
|
|
247
|
+
|
|
248
|
+
cref: str = Field(alias="$ref", pattern=_JSON_POINTER_REGEX)
|
|
249
|
+
|
|
250
|
+
# This method makes RefItem compatible with DocItem
|
|
251
|
+
def get_ref(self):
|
|
252
|
+
"""get_ref."""
|
|
253
|
+
return self
|
|
254
|
+
|
|
255
|
+
model_config = ConfigDict(
|
|
256
|
+
populate_by_name=True,
|
|
383
257
|
)
|
|
384
|
-
page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers")
|
|
385
|
-
page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers")
|
|
386
|
-
s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data")
|
|
387
|
-
identifiers: Optional[list[Identifier[IdentifierTypeT]]] = None
|
|
388
258
|
|
|
389
|
-
|
|
259
|
+
def resolve(self, doc: "DoclingDocument"):
|
|
260
|
+
"""resolve."""
|
|
261
|
+
path_components = self.cref.split("/")
|
|
262
|
+
if (num_comps := len(path_components)) == 3:
|
|
263
|
+
_, path, index_str = path_components
|
|
264
|
+
index = int(index_str)
|
|
265
|
+
obj = doc.__getattribute__(path)[index]
|
|
266
|
+
elif num_comps == 2:
|
|
267
|
+
_, path = path_components
|
|
268
|
+
obj = doc.__getattribute__(path)
|
|
269
|
+
else:
|
|
270
|
+
raise RuntimeError(f"Unsupported number of path components: {num_comps}")
|
|
271
|
+
return obj
|
|
272
|
+
|
|
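RefItem is the glue of the new model: parent, child and caption links are JSON-pointer-like strings into the document's top-level arrays, and resolve() walks them. A hedged illustration, reusing the import paths assumed in the earlier sketches:

from docling_core.types.doc.document import DoclingDocument, RefItem
from docling_core.types.doc.labels import DocItemLabel

doc = DoclingDocument(name="demo")
para = doc.add_text(label=DocItemLabel.PARAGRAPH, text="Hello")

ref = RefItem(cref="#/texts/0")     # three components -> doc.texts[0]
assert ref.resolve(doc) is para

body_ref = RefItem(cref="#/body")   # two components -> doc.body
assert body_ref.resolve(doc) is doc.body
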
273
|
+
|
|
274
|
+
class ImageRef(BaseModel):
|
|
275
|
+
"""ImageRef."""
|
|
276
|
+
|
|
277
|
+
mimetype: str
|
|
278
|
+
dpi: int
|
|
279
|
+
size: Size
|
|
280
|
+
uri: AnyUrl
|
|
281
|
+
_pil: Optional[PILImage.Image] = None
|
|
282
|
+
|
|
283
|
+
@property
|
|
284
|
+
def pil_image(self) -> PILImage.Image:
|
|
285
|
+
"""Return the PIL Image."""
|
|
286
|
+
if self._pil is not None:
|
|
287
|
+
return self._pil
|
|
288
|
+
|
|
289
|
+
if str(self.uri).startswith("data:"):
|
|
290
|
+
encoded_img = str(self.uri).split(",")[1]
|
|
291
|
+
decoded_img = base64.b64decode(encoded_img)
|
|
292
|
+
self._pil = PILImage.open(BytesIO(decoded_img))
|
|
293
|
+
else:
|
|
294
|
+
self._pil = PILImage.open(str(self.uri))
|
|
295
|
+
|
|
296
|
+
return self._pil
|
|
297
|
+
|
|
298
|
+
@field_validator("mimetype")
|
|
390
299
|
@classmethod
|
|
391
|
-
def
|
|
392
|
-
"""
|
|
393
|
-
if
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
if ref := item.pop("__ref", None):
|
|
398
|
-
item["$ref"] = ref
|
|
300
|
+
def validate_mimetype(cls, v):
|
|
301
|
+
"""validate_mimetype."""
|
|
302
|
+
# Check if the provided MIME type is valid using mimetypes module
|
|
303
|
+
if v not in mimetypes.types_map.values():
|
|
304
|
+
raise ValueError(f"'{v}' is not a valid MIME type")
|
|
305
|
+
return v
|
|
399
306
|
|
|
400
|
-
|
|
307
|
+
@classmethod
|
|
308
|
+
def from_pil(cls, image: PILImage.Image, dpi: int) -> Self:
|
|
309
|
+
"""Construct ImageRef from a PIL Image."""
|
|
310
|
+
buffered = BytesIO()
|
|
311
|
+
image.save(buffered, format="PNG")
|
|
312
|
+
img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
|
313
|
+
img_uri = f"data:image/png;base64,{img_str}"
|
|
314
|
+
return cls(
|
|
315
|
+
mimetype="image/png",
|
|
316
|
+
dpi=dpi,
|
|
317
|
+
size=Size(width=image.width, height=image.height),
|
|
318
|
+
uri=img_uri,
|
|
319
|
+
_pil=image,
|
|
320
|
+
)
|
|
401
321
|
|
|
402
|
-
def _resolve_ref(self, item: Ref) -> Optional[Union[BaseCell, BaseText]]:
|
|
403
|
-
"""Return the resolved reference.
|
|
404
322
|
|
|
405
|
-
|
|
406
|
-
|
|
323
|
+
class ProvenanceItem(BaseModel):
|
|
324
|
+
"""ProvenanceItem."""
|
|
325
|
+
|
|
326
|
+
page_no: int
|
|
327
|
+
bbox: BoundingBox
|
|
328
|
+
charspan: Tuple[int, int]
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
class NodeItem(BaseModel):
|
|
332
|
+
"""NodeItem."""
|
|
333
|
+
|
|
334
|
+
self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
|
|
335
|
+
parent: Optional[RefItem] = None
|
|
336
|
+
children: List[RefItem] = []
|
|
337
|
+
|
|
338
|
+
model_config = ConfigDict(extra="forbid")
|
|
339
|
+
|
|
340
|
+
def get_ref(self):
|
|
341
|
+
"""get_ref."""
|
|
342
|
+
return RefItem(cref=self.self_ref)
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
class GroupItem(NodeItem): # Container type, can't be a leaf node
|
|
346
|
+
"""GroupItem."""
|
|
347
|
+
|
|
348
|
+
name: str = (
|
|
349
|
+
"group" # Name of the group, e.g. "Introduction Chapter",
|
|
350
|
+
# "Slide 5", "Navigation menu list", ...
|
|
351
|
+
)
|
|
352
|
+
label: GroupLabel = GroupLabel.UNSPECIFIED
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
class DocItem(
|
|
356
|
+
NodeItem
|
|
357
|
+
): # Base type for any element that carries content, can be a leaf node
|
|
358
|
+
"""DocItem."""
|
|
359
|
+
|
|
360
|
+
label: DocItemLabel
|
|
361
|
+
prov: List[ProvenanceItem] = []
|
|
362
|
+
|
|
363
|
+
def get_location_tokens(
|
|
364
|
+
self,
|
|
365
|
+
doc: "DoclingDocument",
|
|
366
|
+
new_line: str,
|
|
367
|
+
xsize: int = 100,
|
|
368
|
+
ysize: int = 100,
|
|
369
|
+
add_page_index: bool = True,
|
|
370
|
+
) -> str:
|
|
371
|
+
"""Get the location string for the BaseCell."""
|
|
372
|
+
if not len(self.prov):
|
|
373
|
+
return ""
|
|
374
|
+
|
|
375
|
+
location = ""
|
|
376
|
+
for prov in self.prov:
|
|
377
|
+
page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
|
|
378
|
+
|
|
379
|
+
page_i = -1
|
|
380
|
+
if add_page_index:
|
|
381
|
+
page_i = prov.page_no
|
|
382
|
+
|
|
383
|
+
loc_str = DocumentToken.get_location(
|
|
384
|
+
bbox=prov.bbox.to_bottom_left_origin(page_h).as_tuple(),
|
|
385
|
+
page_w=page_w,
|
|
386
|
+
page_h=page_h,
|
|
387
|
+
xsize=xsize,
|
|
388
|
+
ysize=ysize,
|
|
389
|
+
page_i=page_i,
|
|
390
|
+
)
|
|
391
|
+
location += f"{loc_str}{new_line}"
|
|
392
|
+
|
|
393
|
+
return location
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
class TextItem(DocItem):
|
|
397
|
+
"""TextItem."""
|
|
398
|
+
|
|
399
|
+
orig: str # untreated representation
|
|
400
|
+
text: str # sanitized representation
|
|
401
|
+
|
|
402
|
+
def export_to_document_tokens(
|
|
403
|
+
self,
|
|
404
|
+
doc: "DoclingDocument",
|
|
405
|
+
new_line: str = "\n",
|
|
406
|
+
xsize: int = 100,
|
|
407
|
+
ysize: int = 100,
|
|
408
|
+
add_location: bool = True,
|
|
409
|
+
add_content: bool = True,
|
|
410
|
+
add_page_index: bool = True,
|
|
411
|
+
):
|
|
412
|
+
r"""Export text element to document tokens format.
|
|
413
|
+
|
|
414
|
+
:param doc: "DoclingDocument":
|
|
415
|
+
:param new_line: str: (Default value = "\n")
|
|
416
|
+
:param xsize: int: (Default value = 100)
|
|
417
|
+
:param ysize: int: (Default value = 100)
|
|
418
|
+
:param add_location: bool: (Default value = True)
|
|
419
|
+
:param add_content: bool: (Default value = True)
|
|
420
|
+
:param add_page_index: bool: (Default value = True)
|
|
421
|
+
|
|
407
422
|
"""
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
# NOTE: currently only resolves refs explicitely, such that we can make
|
|
411
|
-
# assumptions on ref parts
|
|
412
|
-
if item.obj_type == "table" and self.tables:
|
|
413
|
-
parts = item.ref.split("/")
|
|
414
|
-
result = self.tables[int(parts[2])]
|
|
415
|
-
elif item.obj_type == "figure" and self.figures:
|
|
416
|
-
parts = item.ref.split("/")
|
|
417
|
-
result = self.figures[int(parts[2])]
|
|
418
|
-
elif item.obj_type == "equation" and self.equations:
|
|
419
|
-
parts = item.ref.split("/")
|
|
420
|
-
result = self.equations[int(parts[2])]
|
|
421
|
-
elif item.obj_type == "footnote" and self.footnotes:
|
|
422
|
-
parts = item.ref.split("/")
|
|
423
|
-
result = self.footnotes[int(parts[2])]
|
|
423
|
+
body = f"<{self.label.value}>"
|
|
424
424
|
|
|
425
|
-
|
|
425
|
+
# TODO: This must be done through an explicit mapping.
|
|
426
|
+
# assert DocumentToken.is_known_token(
|
|
427
|
+
# body
|
|
428
|
+
# ), f"failed DocumentToken.is_known_token({body})"
|
|
429
|
+
|
|
430
|
+
if add_location:
|
|
431
|
+
body += self.get_location_tokens(
|
|
432
|
+
doc=doc,
|
|
433
|
+
new_line="",
|
|
434
|
+
xsize=xsize,
|
|
435
|
+
ysize=ysize,
|
|
436
|
+
add_page_index=add_page_index,
|
|
437
|
+
)
|
|
438
|
+
|
|
439
|
+
if add_content and self.text is not None:
|
|
440
|
+
body += self.text.strip()
|
|
441
|
+
|
|
442
|
+
body += f"</{self.label.value}>{new_line}"
|
|
443
|
+
|
|
444
|
+
return body
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
class SectionHeaderItem(TextItem):
|
|
448
|
+
"""SectionItem."""
|
|
449
|
+
|
|
450
|
+
label: typing.Literal[DocItemLabel.SECTION_HEADER] = DocItemLabel.SECTION_HEADER
|
|
451
|
+
level: LevelNumber
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
class ListItem(TextItem):
|
|
455
|
+
"""SectionItem."""
|
|
456
|
+
|
|
457
|
+
label: typing.Literal[DocItemLabel.LIST_ITEM] = DocItemLabel.LIST_ITEM
|
|
458
|
+
enumerated: bool = False
|
|
459
|
+
marker: str # The bullet or number symbol that prefixes this list item
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
class FloatingItem(DocItem):
|
|
463
|
+
"""FloatingItem."""
|
|
464
|
+
|
|
465
|
+
captions: List[RefItem] = []
|
|
466
|
+
references: List[RefItem] = []
|
|
467
|
+
footnotes: List[RefItem] = []
|
|
468
|
+
image: Optional[ImageRef] = None
|
|
469
|
+
|
|
470
|
+
def caption_text(self, doc: "DoclingDocument") -> str:
|
|
471
|
+
"""Computes the caption as a single text."""
|
|
472
|
+
text = ""
|
|
473
|
+
for cap in self.captions:
|
|
474
|
+
text += cap.resolve(doc).text
|
|
475
|
+
return text
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
class PictureItem(FloatingItem):
|
|
479
|
+
"""PictureItem."""
|
|
480
|
+
|
|
481
|
+
label: typing.Literal[DocItemLabel.PICTURE] = DocItemLabel.PICTURE
|
|
482
|
+
|
|
483
|
+
annotations: List[PictureDataType] = []
|
|
484
|
+
|
|
485
|
+
def export_to_document_tokens(
|
|
486
|
+
self,
|
|
487
|
+
doc: "DoclingDocument",
|
|
488
|
+
new_line: str = "\n",
|
|
489
|
+
xsize: int = 100,
|
|
490
|
+
ysize: int = 100,
|
|
491
|
+
add_location: bool = True,
|
|
492
|
+
add_caption: bool = True,
|
|
493
|
+
add_content: bool = True, # not used at the moment
|
|
494
|
+
add_page_index: bool = True,
|
|
495
|
+
):
|
|
496
|
+
r"""Export picture to document tokens format.
|
|
497
|
+
|
|
498
|
+
:param doc: "DoclingDocument":
|
|
499
|
+
:param new_line: str: (Default value = "\n")
|
|
500
|
+
:param xsize: int: (Default value = 100)
|
|
501
|
+
:param ysize: int: (Default value = 100)
|
|
502
|
+
:param add_location: bool: (Default value = True)
|
|
503
|
+
:param add_caption: bool: (Default value = True)
|
|
504
|
+
:param add_content: bool: (Default value = True)
|
|
505
|
+
:param # not used at the momentadd_page_index: bool: (Default value = True)
|
|
506
|
+
|
|
507
|
+
"""
|
|
508
|
+
body = f"{DocumentToken.BEG_FIGURE.value}{new_line}"
|
|
509
|
+
|
|
510
|
+
if add_location:
|
|
511
|
+
body += self.get_location_tokens(
|
|
512
|
+
doc=doc,
|
|
513
|
+
new_line=new_line,
|
|
514
|
+
xsize=xsize,
|
|
515
|
+
ysize=ysize,
|
|
516
|
+
add_page_index=add_page_index,
|
|
517
|
+
)
|
|
518
|
+
|
|
519
|
+
if add_caption and len(self.captions):
|
|
520
|
+
text = self.caption_text(doc)
|
|
521
|
+
|
|
522
|
+
if len(text):
|
|
523
|
+
body += f"{DocumentToken.BEG_CAPTION.value}"
|
|
524
|
+
body += f"{text.strip()}"
|
|
525
|
+
body += f"{DocumentToken.END_CAPTION.value}"
|
|
526
|
+
body += f"{new_line}"
|
|
527
|
+
|
|
528
|
+
body += f"{DocumentToken.END_FIGURE.value}{new_line}"
|
|
529
|
+
|
|
530
|
+
return body
|
|
426
531
|
|
|
427
|
-
def get_map_to_page_dimensions(self):
|
|
428
|
-
"""Get a map from page-index (start at 1) to page-dim [width, height]."""
|
|
429
|
-
pagedims = {}
|
|
430
532
|
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
pagedims[_.page] = [_.width, _.height]
|
|
533
|
+
class TableItem(FloatingItem):
|
|
534
|
+
"""TableItem."""
|
|
434
535
|
|
|
435
|
-
|
|
536
|
+
data: TableData
|
|
537
|
+
label: typing.Literal[DocItemLabel.TABLE] = DocItemLabel.TABLE
|
|
538
|
+
|
|
539
|
+
def export_to_dataframe(self) -> pd.DataFrame:
|
|
540
|
+
"""Export the table as a Pandas DataFrame."""
|
|
541
|
+
if self.data.num_rows == 0 or self.data.num_cols == 0:
|
|
542
|
+
return pd.DataFrame()
|
|
543
|
+
|
|
544
|
+
# Count how many rows are column headers
|
|
545
|
+
num_headers = 0
|
|
546
|
+
for i, row in enumerate(self.data.grid):
|
|
547
|
+
if len(row) == 0:
|
|
548
|
+
raise RuntimeError(
|
|
549
|
+
f"Invalid table. {len(row)=} but {self.data.num_cols=}."
|
|
550
|
+
)
|
|
551
|
+
|
|
552
|
+
any_header = False
|
|
553
|
+
for cell in row:
|
|
554
|
+
if cell.column_header:
|
|
555
|
+
any_header = True
|
|
556
|
+
break
|
|
557
|
+
|
|
558
|
+
if any_header:
|
|
559
|
+
num_headers += 1
|
|
560
|
+
else:
|
|
561
|
+
break
|
|
562
|
+
|
|
563
|
+
# Create the column names from all col_headers
|
|
564
|
+
columns: Optional[List[str]] = None
|
|
565
|
+
if num_headers > 0:
|
|
566
|
+
columns = ["" for _ in range(self.data.num_cols)]
|
|
567
|
+
for i in range(num_headers):
|
|
568
|
+
for j, cell in enumerate(self.data.grid[i]):
|
|
569
|
+
col_name = cell.text
|
|
570
|
+
if columns[j] != "":
|
|
571
|
+
col_name = f".{col_name}"
|
|
572
|
+
columns[j] += col_name
|
|
573
|
+
|
|
574
|
+
# Create table data
|
|
575
|
+
table_data = [
|
|
576
|
+
[cell.text for cell in row] for row in self.data.grid[num_headers:]
|
|
577
|
+
]
|
|
578
|
+
|
|
579
|
+
# Create DataFrame
|
|
580
|
+
df = pd.DataFrame(table_data, columns=columns)
|
|
581
|
+
|
|
582
|
+
return df
|
|
583
|
+
|
|
584
|
+
def export_to_markdown(self) -> str:
|
|
585
|
+
"""Export the table as markdown."""
|
|
586
|
+
table = []
|
|
587
|
+
for row in self.data.grid:
|
|
588
|
+
tmp = []
|
|
589
|
+
for col in row:
|
|
590
|
+
tmp.append(col.text)
|
|
591
|
+
table.append(tmp)
|
|
592
|
+
|
|
593
|
+
md_table = ""
|
|
594
|
+
if len(table) > 1 and len(table[0]) > 0:
|
|
595
|
+
try:
|
|
596
|
+
md_table = tabulate(table[1:], headers=table[0], tablefmt="github")
|
|
597
|
+
except ValueError:
|
|
598
|
+
md_table = tabulate(
|
|
599
|
+
table[1:],
|
|
600
|
+
headers=table[0],
|
|
601
|
+
tablefmt="github",
|
|
602
|
+
disable_numparse=True,
|
|
603
|
+
)
|
|
604
|
+
return md_table
|
|
605
|
+
|
|
606
|
+
def export_to_html(self) -> str:
|
|
607
|
+
"""Export the table as html."""
|
|
608
|
+
body = ""
|
|
609
|
+
nrows = self.data.num_rows
|
|
610
|
+
ncols = self.data.num_cols
|
|
611
|
+
|
|
612
|
+
if not len(self.data.table_cells):
|
|
613
|
+
return ""
|
|
614
|
+
for i in range(nrows):
|
|
615
|
+
body += "<tr>"
|
|
616
|
+
for j in range(ncols):
|
|
617
|
+
cell: TableCell = self.data.grid[i][j]
|
|
618
|
+
|
|
619
|
+
rowspan, rowstart = (
|
|
620
|
+
cell.row_span,
|
|
621
|
+
cell.start_row_offset_idx,
|
|
622
|
+
)
|
|
623
|
+
colspan, colstart = (
|
|
624
|
+
cell.col_span,
|
|
625
|
+
cell.start_col_offset_idx,
|
|
626
|
+
)
|
|
627
|
+
|
|
628
|
+
if rowstart != i:
|
|
629
|
+
continue
|
|
630
|
+
if colstart != j:
|
|
631
|
+
continue
|
|
632
|
+
|
|
633
|
+
content = cell.text.strip()
|
|
634
|
+
celltag = "td"
|
|
635
|
+
if cell.column_header:
|
|
636
|
+
celltag = "th"
|
|
637
|
+
|
|
638
|
+
opening_tag = f"{celltag}"
|
|
639
|
+
if rowspan > 1:
|
|
640
|
+
opening_tag += f' rowspan="{rowspan}"'
|
|
641
|
+
if colspan > 1:
|
|
642
|
+
opening_tag += f' colspan="{colspan}"'
|
|
643
|
+
|
|
644
|
+
body += f"<{opening_tag}>{content}</{celltag}>"
|
|
645
|
+
body += "</tr>"
|
|
646
|
+
body = f"<table>{body}</table>"
|
|
647
|
+
|
|
648
|
+
return body
|
|
649
|
+
|
|
650
|
+
def export_to_document_tokens(
|
|
651
|
+
self,
|
|
652
|
+
doc: "DoclingDocument",
|
|
653
|
+
new_line: str = "\n",
|
|
654
|
+
xsize: int = 100,
|
|
655
|
+
ysize: int = 100,
|
|
656
|
+
add_location: bool = True,
|
|
657
|
+
add_caption: bool = True,
|
|
658
|
+
add_content: bool = True,
|
|
659
|
+
add_cell_location: bool = True,
|
|
660
|
+
add_cell_label: bool = True,
|
|
661
|
+
add_cell_text: bool = True,
|
|
662
|
+
add_page_index: bool = True,
|
|
663
|
+
):
|
|
664
|
+
r"""Export table to document tokens format.
|
|
665
|
+
|
|
666
|
+
:param doc: "DoclingDocument":
|
|
667
|
+
:param new_line: str: (Default value = "\n")
|
|
668
|
+
:param xsize: int: (Default value = 100)
|
|
669
|
+
:param ysize: int: (Default value = 100)
|
|
670
|
+
:param add_location: bool: (Default value = True)
|
|
671
|
+
:param add_caption: bool: (Default value = True)
|
|
672
|
+
:param add_content: bool: (Default value = True)
|
|
673
|
+
:param add_cell_location: bool: (Default value = True)
|
|
674
|
+
:param add_cell_label: bool: (Default value = True)
|
|
675
|
+
:param add_cell_text: bool: (Default value = True)
|
|
676
|
+
:param add_page_index: bool: (Default value = True)
|
|
677
|
+
|
|
678
|
+
"""
|
|
679
|
+
body = f"{DocumentToken.BEG_TABLE.value}{new_line}"
|
|
680
|
+
|
|
681
|
+
if add_location:
|
|
682
|
+
body += self.get_location_tokens(
|
|
683
|
+
doc=doc,
|
|
684
|
+
new_line=new_line,
|
|
685
|
+
xsize=xsize,
|
|
686
|
+
ysize=ysize,
|
|
687
|
+
add_page_index=add_page_index,
|
|
688
|
+
)
|
|
689
|
+
|
|
690
|
+
if add_caption and len(self.captions):
|
|
691
|
+
text = self.caption_text(doc)
|
|
692
|
+
|
|
693
|
+
if len(text):
|
|
694
|
+
body += f"{DocumentToken.BEG_CAPTION.value}"
|
|
695
|
+
body += f"{text.strip()}"
|
|
696
|
+
body += f"{DocumentToken.END_CAPTION.value}"
|
|
697
|
+
body += f"{new_line}"
|
|
698
|
+
|
|
699
|
+
if add_content and len(self.data.table_cells) > 0:
|
|
700
|
+
for i, row in enumerate(self.data.grid):
|
|
701
|
+
body += f"<row_{i}>"
|
|
702
|
+
for j, col in enumerate(row):
|
|
703
|
+
|
|
704
|
+
text = ""
|
|
705
|
+
if add_cell_text:
|
|
706
|
+
text = col.text.strip()
|
|
707
|
+
|
|
708
|
+
cell_loc = ""
|
|
709
|
+
if (
|
|
710
|
+
col.bbox is not None
|
|
711
|
+
and add_cell_location
|
|
712
|
+
and add_page_index
|
|
713
|
+
and len(self.prov) > 0
|
|
714
|
+
):
|
|
715
|
+
page_w, page_h = doc.pages[self.prov[0].page_no].size.as_tuple()
|
|
716
|
+
cell_loc = DocumentToken.get_location(
|
|
717
|
+
bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
|
|
718
|
+
page_w=page_w,
|
|
719
|
+
page_h=page_h,
|
|
720
|
+
xsize=xsize,
|
|
721
|
+
ysize=ysize,
|
|
722
|
+
page_i=self.prov[0].page_no,
|
|
723
|
+
)
|
|
724
|
+
elif (
|
|
725
|
+
col.bbox is not None
|
|
726
|
+
and add_cell_location
|
|
727
|
+
and not add_page_index
|
|
728
|
+
and len(self.prov) > 0
|
|
729
|
+
):
|
|
730
|
+
page_w, page_h = doc.pages[self.prov[0].page_no].size.as_tuple()
|
|
731
|
+
|
|
732
|
+
cell_loc = DocumentToken.get_location(
|
|
733
|
+
bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
|
|
734
|
+
page_w=page_w,
|
|
735
|
+
page_h=page_h,
|
|
736
|
+
xsize=xsize,
|
|
737
|
+
ysize=ysize,
|
|
738
|
+
page_i=-1,
|
|
739
|
+
)
|
|
740
|
+
|
|
741
|
+
cell_label = ""
|
|
742
|
+
if add_cell_label:
|
|
743
|
+
if col.column_header:
|
|
744
|
+
cell_label = "<col_header>"
|
|
745
|
+
elif col.row_header:
|
|
746
|
+
cell_label = "<row_header>"
|
|
747
|
+
elif col.row_section:
|
|
748
|
+
cell_label = "<row_section>"
|
|
749
|
+
else:
|
|
750
|
+
cell_label = "<body>"
|
|
751
|
+
|
|
752
|
+
body += f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>"
|
|
753
|
+
|
|
754
|
+
body += f"</row_{i}>{new_line}"
|
|
755
|
+
|
|
756
|
+
body += f"{DocumentToken.END_TABLE.value}{new_line}"
|
|
757
|
+
|
|
758
|
+
return body
|
|
759
|
+
|
|
760
|
+
|
|
761
|
+
class KeyValueItem(DocItem):
|
|
762
|
+
"""KeyValueItem."""
|
|
763
|
+
|
|
764
|
+
|
|
765
|
+
ContentItem = Union[
|
|
766
|
+
TextItem, SectionHeaderItem, ListItem, PictureItem, TableItem, KeyValueItem
|
|
767
|
+
]
|
|
768
|
+
|
|
769
|
+
|
|
770
|
+
class PageItem(BaseModel):
|
|
771
|
+
"""PageItem."""
|
|
772
|
+
|
|
773
|
+
# A page carries separate root items for furniture and body,
|
|
774
|
+
# only referencing items on the page
|
|
775
|
+
size: Size
|
|
776
|
+
image: Optional[ImageRef] = None
|
|
777
|
+
page_no: int
|
|
778
|
+
|
|
779
|
+
|
|
780
|
+
class DoclingDocument(BaseModel):
|
|
781
|
+
"""DoclingDocument."""
|
|
782
|
+
|
|
783
|
+
schema_name: typing.Literal["DoclingDocument"] = "DoclingDocument"
|
|
784
|
+
version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
|
|
785
|
+
CURRENT_VERSION
|
|
786
|
+
)
|
|
787
|
+
name: str # The working name of this document, without extensions
|
|
788
|
+
# (could be taken from originating doc, or just "Untitled 1")
|
|
789
|
+
origin: Optional[DocumentOrigin] = (
|
|
790
|
+
None # DoclingDocuments may specify an origin (converted to DoclingDocument).
|
|
791
|
+
# This is optional, e.g. a DoclingDocument could also be entirely
|
|
792
|
+
# generated from synthetic data.
|
|
793
|
+
)
|
|
794
|
+
|
|
795
|
+
furniture: GroupItem = GroupItem(
|
|
796
|
+
name="_root_", self_ref="#/furniture"
|
|
797
|
+
) # List[RefItem] = []
|
|
798
|
+
body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []
|
|
799
|
+
|
|
800
|
+
groups: List[GroupItem] = []
|
|
801
|
+
texts: List[Union[SectionHeaderItem, ListItem, TextItem]] = []
|
|
802
|
+
pictures: List[PictureItem] = []
|
|
803
|
+
tables: List[TableItem] = []
|
|
804
|
+
key_value_items: List[KeyValueItem] = []
|
|
805
|
+
|
|
806
|
+
pages: Dict[int, PageItem] = {} # empty as default
|
|
807
|
+
|
|
808
|
+
def add_group(
|
|
809
|
+
self,
|
|
810
|
+
label: Optional[GroupLabel] = None,
|
|
811
|
+
name: Optional[str] = None,
|
|
812
|
+
parent: Optional[GroupItem] = None,
|
|
813
|
+
) -> GroupItem:
|
|
814
|
+
"""add_group.
|
|
815
|
+
|
|
816
|
+
:param label: Optional[GroupLabel]: (Default value = None)
|
|
817
|
+
:param name: Optional[str]: (Default value = None)
|
|
818
|
+
:param parent: Optional[GroupItem]: (Default value = None)
|
|
819
|
+
|
|
820
|
+
"""
|
|
821
|
+
if not parent:
|
|
822
|
+
parent = self.body
|
|
823
|
+
|
|
824
|
+
group_index = len(self.groups)
|
|
825
|
+
cref = f"#/groups/{group_index}"
|
|
826
|
+
|
|
827
|
+
group = GroupItem(self_ref=cref, parent=parent.get_ref())
|
|
828
|
+
if name is not None:
|
|
829
|
+
group.name = name
|
|
830
|
+
if label is not None:
|
|
831
|
+
group.label = label
|
|
832
|
+
|
|
833
|
+
self.groups.append(group)
|
|
834
|
+
parent.children.append(RefItem(cref=cref))
|
|
835
|
+
|
|
836
|
+
return group
|
|
837
|
+
|
|
838
|
+
def add_list_item(
|
|
839
|
+
self,
|
|
840
|
+
text: str,
|
|
841
|
+
enumerated: bool = False,
|
|
842
|
+
marker: Optional[str] = None,
|
|
843
|
+
orig: Optional[str] = None,
|
|
844
|
+
prov: Optional[ProvenanceItem] = None,
|
|
845
|
+
parent: Optional[GroupItem] = None,
|
|
846
|
+
):
|
|
847
|
+
"""add_paragraph.
|
|
848
|
+
|
|
849
|
+
:param label: str:
|
|
850
|
+
:param text: str:
|
|
851
|
+
:param orig: Optional[str]: (Default value = None)
|
|
852
|
+
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
853
|
+
:param parent: Optional[GroupItem]: (Default value = None)
|
|
854
|
+
|
|
855
|
+
"""
|
|
856
|
+
if not parent:
|
|
857
|
+
parent = self.body
|
|
858
|
+
|
|
859
|
+
if not orig:
|
|
860
|
+
orig = text
|
|
861
|
+
|
|
862
|
+
marker = marker or "-"
|
|
863
|
+
|
|
864
|
+
text_index = len(self.texts)
|
|
865
|
+
cref = f"#/texts/{text_index}"
|
|
866
|
+
list_item = ListItem(
|
|
867
|
+
text=text,
|
|
868
|
+
orig=orig,
|
|
869
|
+
self_ref=cref,
|
|
870
|
+
parent=parent.get_ref(),
|
|
871
|
+
enumerated=enumerated,
|
|
872
|
+
marker=marker,
|
|
873
|
+
)
|
|
874
|
+
if prov:
|
|
875
|
+
list_item.prov.append(prov)
|
|
876
|
+
|
|
877
|
+
self.texts.append(list_item)
|
|
878
|
+
parent.children.append(RefItem(cref=cref))
|
|
879
|
+
|
|
880
|
+
return list_item
|
|
881
|
+
|
|
882
|
+
def add_text(
|
|
883
|
+
self,
|
|
884
|
+
label: DocItemLabel,
|
|
885
|
+
text: str,
|
|
886
|
+
orig: Optional[str] = None,
|
|
887
|
+
prov: Optional[ProvenanceItem] = None,
|
|
888
|
+
parent: Optional[GroupItem] = None,
|
|
889
|
+
):
|
|
890
|
+
"""add_paragraph.
|
|
891
|
+
|
|
892
|
+
:param label: str:
|
|
893
|
+
:param text: str:
|
|
894
|
+
:param orig: Optional[str]: (Default value = None)
|
|
895
|
+
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
896
|
+
:param parent: Optional[GroupItem]: (Default value = None)
|
|
897
|
+
|
|
898
|
+
"""
|
|
899
|
+
if not parent:
|
|
900
|
+
parent = self.body
|
|
901
|
+
|
|
902
|
+
if not orig:
|
|
903
|
+
orig = text
|
|
904
|
+
|
|
905
|
+
text_index = len(self.texts)
|
|
906
|
+
cref = f"#/texts/{text_index}"
|
|
907
|
+
text_item = TextItem(
|
|
908
|
+
label=label,
|
|
909
|
+
text=text,
|
|
910
|
+
orig=orig,
|
|
911
|
+
self_ref=cref,
|
|
912
|
+
parent=parent.get_ref(),
|
|
913
|
+
)
|
|
914
|
+
if prov:
|
|
915
|
+
text_item.prov.append(prov)
|
|
916
|
+
|
|
917
|
+
self.texts.append(text_item)
|
|
918
|
+
parent.children.append(RefItem(cref=cref))
|
|
919
|
+
|
|
920
|
+
return text_item
|
|
921
|
+
|
|
922
|
+
def add_table(
|
|
923
|
+
self,
|
|
924
|
+
data: TableData,
|
|
925
|
+
caption: Optional[Union[TextItem, RefItem]] = None, # This is not cool yet.
|
|
926
|
+
prov: Optional[ProvenanceItem] = None,
|
|
927
|
+
parent: Optional[GroupItem] = None,
|
|
928
|
+
):
|
|
929
|
+
"""add_table.
|
|
930
|
+
|
|
931
|
+
:param data: BaseTableData:
|
|
932
|
+
:param caption: Optional[Union[TextItem:
|
|
933
|
+
:param RefItem]]: (Default value = None)
|
|
934
|
+
:param # This is not cool yet.prov: Optional[ProvenanceItem]
|
|
935
|
+
:param parent: Optional[GroupItem]: (Default value = None)
|
|
936
|
+
|
|
937
|
+
"""
|
|
938
|
+
if not parent:
|
|
939
|
+
parent = self.body
|
|
940
|
+
|
|
941
|
+
table_index = len(self.tables)
|
|
942
|
+
cref = f"#/tables/{table_index}"
|
|
943
|
+
|
|
944
|
+
tbl_item = TableItem(
|
|
945
|
+
label=DocItemLabel.TABLE, data=data, self_ref=cref, parent=parent.get_ref()
|
|
946
|
+
)
|
|
947
|
+
if prov:
|
|
948
|
+
tbl_item.prov.append(prov)
|
|
949
|
+
if caption:
|
|
950
|
+
tbl_item.captions.append(caption.get_ref())
|
|
951
|
+
|
|
952
|
+
self.tables.append(tbl_item)
|
|
953
|
+
parent.children.append(RefItem(cref=cref))
|
|
954
|
+
|
|
955
|
+
return tbl_item
|
|
956
|
+
|
|
957
|
+
def add_picture(
|
|
958
|
+
self,
|
|
959
|
+
annotations: List[PictureDataType] = [],
|
|
960
|
+
image: Optional[ImageRef] = None,
|
|
961
|
+
caption: Optional[Union[TextItem, RefItem]] = None,
|
|
962
|
+
prov: Optional[ProvenanceItem] = None,
|
|
963
|
+
parent: Optional[GroupItem] = None,
|
|
964
|
+
):
|
|
965
|
+
"""add_picture.
|
|
966
|
+
|
|
967
|
+
:param data: List[PictureData]: (Default value = [])
|
|
968
|
+
:param caption: Optional[Union[TextItem:
|
|
969
|
+
:param RefItem]]: (Default value = None)
|
|
970
|
+
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
971
|
+
:param parent: Optional[GroupItem]: (Default value = None)
|
|
972
|
+
|
|
973
|
+
"""
|
|
974
|
+
if not parent:
|
|
975
|
+
parent = self.body
|
|
976
|
+
|
|
977
|
+
picture_index = len(self.pictures)
|
|
978
|
+
cref = f"#/pictures/{picture_index}"
|
|
979
|
+
|
|
980
|
+
fig_item = PictureItem(
|
|
981
|
+
label=DocItemLabel.PICTURE,
|
|
982
|
+
annotations=annotations,
|
|
983
|
+
image=image,
|
|
984
|
+
self_ref=cref,
|
|
985
|
+
parent=parent.get_ref(),
|
|
986
|
+
)
|
|
987
|
+
if prov:
|
|
988
|
+
fig_item.prov.append(prov)
|
|
989
|
+
if caption:
|
|
990
|
+
fig_item.captions.append(caption.get_ref())
|
|
991
|
+
|
|
992
|
+
self.pictures.append(fig_item)
|
|
993
|
+
parent.children.append(RefItem(cref=cref))
|
|
994
|
+
|
|
995
|
+
return fig_item
|
|
996
|
+
|
|
997
|
+
def add_heading(
|
|
998
|
+
self,
|
|
999
|
+
text: str,
|
|
1000
|
+
orig: Optional[str] = None,
|
|
1001
|
+
level: LevelNumber = 1,
|
|
1002
|
+
prov: Optional[ProvenanceItem] = None,
|
|
1003
|
+
parent: Optional[GroupItem] = None,
|
|
1004
|
+
):
|
|
1005
|
+
"""add_heading.
|
|
1006
|
+
|
|
1007
|
+
:param label: DocItemLabel:
|
|
1008
|
+
:param text: str:
|
|
1009
|
+
:param orig: Optional[str]: (Default value = None)
|
|
1010
|
+
:param level: LevelNumber: (Default value = 1)
|
|
1011
|
+
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1012
|
+
:param parent: Optional[GroupItem]: (Default value = None)
|
|
1013
|
+
|
|
1014
|
+
"""
|
|
1015
|
+
if not parent:
|
|
1016
|
+
parent = self.body
|
|
1017
|
+
|
|
1018
|
+
if not orig:
|
|
1019
|
+
orig = text
|
|
1020
|
+
|
|
1021
|
+
text_index = len(self.texts)
|
|
1022
|
+
cref = f"#/texts/{text_index}"
|
|
1023
|
+
section_header_item = SectionHeaderItem(
|
|
1024
|
+
level=level,
|
|
1025
|
+
text=text,
|
|
1026
|
+
orig=orig,
|
|
1027
|
+
self_ref=cref,
|
|
1028
|
+
parent=parent.get_ref(),
|
|
1029
|
+
)
|
|
1030
|
+
if prov:
|
|
1031
|
+
section_header_item.prov.append(prov)
|
|
1032
|
+
|
|
1033
|
+
self.texts.append(section_header_item)
|
|
1034
|
+
parent.children.append(RefItem(cref=cref))
|
|
1035
|
+
|
|
1036
|
+
return section_header_item
|
|
1037
|
+
|
|
1038
|
+
def num_pages(self):
|
|
1039
|
+
"""num_pages."""
|
|
1040
|
+
return len(self.pages.values())
|
|
1041
|
+
|
|
1042
|
+
def validate_tree(self, root) -> bool:
|
|
1043
|
+
"""validate_tree."""
|
|
1044
|
+
res = []
|
|
1045
|
+
for child_ref in root.children:
|
|
1046
|
+
child = child_ref.resolve(self)
|
|
1047
|
+
if child.parent.resolve(self) != root:
|
|
1048
|
+
return False
|
|
1049
|
+
res.append(self.validate_tree(child))
|
|
1050
|
+
|
|
1051
|
+
return all(res) or len(res) == 0
|
|
1052
|
+
|
|
1053
|
+
def iterate_items(
|
|
1054
|
+
self,
|
|
1055
|
+
root: Optional[NodeItem] = None,
|
|
1056
|
+
with_groups: bool = False,
|
|
1057
|
+
traverse_pictures: bool = True,
|
|
1058
|
+
page_no: Optional[int] = None,
|
|
1059
|
+
_level: int = 0, # fixed parameter, carries through the node nesting level
|
|
1060
|
+
) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
|
|
1061
|
+
"""iterate_elements.
|
|
1062
|
+
|
|
1063
|
+
:param root: Optional[NodeItem]: (Default value = None)
|
|
1064
|
+
:param with_groups: bool: (Default value = False)
|
|
1065
|
+
:param traverse_pictures: bool: (Default value = True)
|
|
1066
|
+
:param page_no: Optional[int]: (Default value = None)
|
|
1067
|
+
:param _level: (Default value = 0)
|
|
1068
|
+
:param # fixed parameter:
|
|
1069
|
+
:param carries through the node nesting level:
|
|
1070
|
+
"""
|
|
1071
|
+
if not root:
|
|
1072
|
+
root = self.body
|
|
1073
|
+
|
|
1074
|
+
if not isinstance(root, GroupItem) or with_groups:
|
|
1075
|
+
if isinstance(root, DocItem):
|
|
1076
|
+
if page_no is not None:
|
|
1077
|
+
for prov in root.prov:
|
|
1078
|
+
if prov.page_no == page_no:
|
|
1079
|
+
yield root, _level
|
|
1080
|
+
else:
|
|
1081
|
+
yield root, _level
|
|
1082
|
+
else:
|
|
1083
|
+
yield root, _level
|
|
1084
|
+
|
|
1085
|
+
# Traverse children
|
|
1086
|
+
for child_ref in root.children:
|
|
1087
|
+
child = child_ref.resolve(self)
|
|
1088
|
+
|
|
1089
|
+
if isinstance(child, NodeItem):
|
|
1090
|
+
# If the child is a NodeItem, recursively traverse it
|
|
1091
|
+
if not isinstance(child, PictureItem) or traverse_pictures:
|
|
1092
|
+
yield from self.iterate_items(
|
|
1093
|
+
child, _level=_level + 1, with_groups=with_groups
|
|
1094
|
+
)
|
|
1095
|
+
|
|
1096
|
+
def print_element_tree(self):
|
|
1097
|
+
"""print_element_tree."""
|
|
1098
|
+
for ix, (item, level) in enumerate(self.iterate_items(with_groups=True)):
|
|
1099
|
+
if isinstance(item, GroupItem):
|
|
1100
|
+
print(" " * level, f"{ix}: {item.label.value} with name={item.name}")
|
|
1101
|
+
elif isinstance(item, DocItem):
|
|
1102
|
+
print(" " * level, f"{ix}: {item.label.value}")
|
|
1103
|
+
|
|
1104
|
+
def export_to_dict(self) -> Dict:
|
|
1105
|
+
"""export_to_dict."""
|
|
1106
|
+
return self.model_dump(mode="json", by_alias=True, exclude_none=True)
|
|
436
1107
|
|
|
437
1108
|
def export_to_markdown( # noqa: C901
|
|
438
1109
|
self,
|
|
439
1110
|
delim: str = "\n\n",
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
"title",
|
|
444
|
-
"subtitle-level-1",
|
|
445
|
-
"paragraph",
|
|
446
|
-
"caption",
|
|
447
|
-
"table",
|
|
448
|
-
"figure",
|
|
449
|
-
],
|
|
1111
|
+
from_element: int = 0,
|
|
1112
|
+
to_element: Optional[int] = None,
|
|
1113
|
+
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
|
|
450
1114
|
strict_text: bool = False,
|
|
451
1115
|
image_placeholder: str = "<!-- image -->",
|
|
452
1116
|
) -> str:
|
|
@@ -455,66 +1119,77 @@ class ExportedCCSDocument(
         Operates on a slice of the document's main_text as defined through arguments
         main_text_start and main_text_stop; defaulting to the whole main_text.
 
-
-            delim (str, optional): Delimiter to use when concatenating the various
+        :param delim: Delimiter to use when concatenating the various
             Markdown parts. Defaults to "\n\n".
-
+        :type delim: str
+        :param from_element: Body slicing start index (inclusive).
             Defaults to 0.
-
+        :type from_element: int
+        :param to_element: Body slicing stop index
             (exclusive). Defaults to None.
-
-
-
-
-
-
-
-
-
+        :type to_element: Optional[int]
+        :param delim: str: (Default value = "\n\n")
+        :param from_element: int: (Default value = 0)
+        :param to_element: Optional[int]: (Default value = None)
+        :param labels: set[DocItemLabel]
+        :param "subtitle-level-1":
+        :param "paragraph":
+        :param "caption":
+        :param "table":
+        :param "Text":
+        :param "text":
+        :param ]:
+        :param strict_text: bool: (Default value = False)
+        :param image_placeholder str: (Default value = "<!-- image -->")
+            the placeholder to include to position images in the markdown.
+        :returns: The exported Markdown representation.
+        :rtype: str
         """
         has_title = False
         prev_text = ""
         md_texts: list[str] = []
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # collect all captions embedded in table and figure objects
+        # to avoid repeating them
+        embedded_captions = set()
+        skip_count = 0
+        for ix, (item, level) in enumerate(self.iterate_items(self.body)):
+            if skip_count < from_element:
+                skip_count += 1
+                continue  # skip as many items as you want
+
+            if to_element and ix >= to_element:
+                break
+
+            if (
+                isinstance(item, (TableItem, PictureItem))
+                and len(item.captions) > 0
+                and item.label in labels
+            ):
+                caption = item.caption_text(self)
+                if caption:
+                    embedded_captions.add(caption)
+
+        skip_count = 0
+        for ix, (item, level) in enumerate(self.iterate_items(self.body)):
+            if skip_count < from_element:
+                skip_count += 1
+                continue  # skip as many items as you want
+
+            if to_element and ix >= to_element:
+                break
+
+            markdown_text = ""
+
+            if isinstance(item, DocItem):
+                item_type = item.label
+
+                if isinstance(item, TextItem) and item_type in labels:
                     text = item.text
 
                     # skip captions of they are embedded in the actual
                     # floating object
-                    if item_type ==
+                    if item_type == DocItemLabel.CAPTION and text in embedded_captions:
                         continue
 
                     # ignore repeated text
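The first pass above collects the captions already attached to tables and pictures, so that the second pass can skip the matching standalone caption items instead of emitting them twice. Both passes apply the same slicing semantics; a small sketch of the equivalent iteration (illustrative only; doc is assumed to be a DoclingDocument instance):

    # from_element is inclusive and to_element is exclusive, counted over the
    # reading-order traversal of doc.body.
    for ix, (item, level) in enumerate(doc.iterate_items(doc.body)):
        if ix < 2:      # behaves like from_element=2
            continue
        if ix >= 5:     # behaves like to_element=5
            break
        print(ix, type(item).__name__, level)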
@@ -524,7 +1199,7 @@ class ExportedCCSDocument(
                     prev_text = text
 
                     # first title match
-                    if item_type ==
+                    if item_type == DocItemLabel.TITLE and not has_title:
                         if strict_text:
                             markdown_text = f"{text}"
                         else:
@@ -532,78 +1207,89 @@ class ExportedCCSDocument(
                         has_title = True
 
                     # secondary titles
-                    elif item_type in {
-
-
+                    elif item_type in {
+                        DocItemLabel.TITLE,
+                        DocItemLabel.SECTION_HEADER,
+                    } or (has_title and item_type == DocItemLabel.TITLE):
                         if strict_text:
                             markdown_text = f"{text}"
                         else:
                             markdown_text = f"## {text}"
 
+                    # secondary titles
+                    elif isinstance(item, ListItem):
+                        if item.enumerated:
+                            marker = item.marker
+                        else:
+                            marker = "-"
+
+                        markdown_text = f"{marker} {text}"
+
                     # normal text
                     else:
                         markdown_text = text
 
-                elif (
-
-                    and item.data
-                    and item_type in main_text_labels
-                ):
+                elif isinstance(item, TableItem) and item.data and item_type in labels:
+                    parts = []
 
-
-
-
-
-
-
-                        table.append(tmp)
-
-                    if len(table) > 1 and len(table[0]) > 0:
-                        try:
-                            md_table = tabulate(
-                                table[1:], headers=table[0], tablefmt="github"
-                            )
-                        except ValueError:
-                            md_table = tabulate(
-                                table[1:],
-                                headers=table[0],
-                                tablefmt="github",
-                                disable_numparse=True,
-                            )
-
-                    markdown_text = ""
-                    if item.text:
-                        markdown_text = item.text
+                    # Compute the caption
+                    if caption := item.caption_text(self):
+                        parts.append(caption)
+                        parts.append("\n")
+
+                    # Rendered the item
                     if not strict_text:
-
+                        md_table = item.export_to_markdown()
+                        if md_table:
+                            parts.append(item.export_to_markdown())
+
+                    # Combine parts
+                    markdown_text = "\n".join(parts)
 
-                elif isinstance(item,
+                elif isinstance(item, PictureItem) and item_type in labels:
+                    parts = []
 
-
-                    if item.
-
+                    # Compute the caption
+                    if caption := item.caption_text(self):
+                        parts.append(caption)
+                        parts.append("\n")
+
+                    # Rendered the item
                     if not strict_text:
-
+                        parts.append(f"{image_placeholder}")
+
+                    # Combine parts
+                    markdown_text = "\n".join(parts)
 
-
-
+            if markdown_text:
+                md_texts.append(markdown_text)
 
         result = delim.join(md_texts)
         return result
 
+    def export_to_text(  # noqa: C901
+        self,
+        delim: str = "\n\n",
+        from_element: int = 0,
+        to_element: Optional[int] = None,
+        labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
+    ) -> str:
+        """export_to_text."""
+        return self.export_to_markdown(
+            delim,
+            from_element,
+            to_element,
+            labels,
+            strict_text=True,
+            image_placeholder="",
+        )
+
     def export_to_document_tokens(
         self,
         delim: str = "\n\n",
-
-
-
-            "title",
-            "subtitle-level-1",
-            "paragraph",
-            "caption",
-            "table",
-            "figure",
-        ],
+        from_element: int = 0,
+        to_element: Optional[int] = None,
+        labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
         xsize: int = 100,
         ysize: int = 100,
         add_location: bool = True,
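export_to_text() is now a thin wrapper around export_to_markdown(): it forwards delim, from_element, to_element and labels, and forces strict_text=True with an empty image placeholder, so headings, list markers, tables and image placeholders are dropped from the output. A minimal sketch (illustrative only; doc is an assumed DoclingDocument instance):

    plain = doc.export_to_text(from_element=0, to_element=None)

    # With default arguments, the wrapper and the explicit call below
    # produce the same string.
    assert plain == doc.export_to_markdown(strict_text=True, image_placeholder="")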
@@ -616,11 +1302,23 @@ class ExportedCCSDocument(
     ) -> str:
         r"""Exports the document content to an DocumentToken format.
 
-        Operates on a slice of the document's
-
-
-
+        Operates on a slice of the document's body as defined through arguments
+        from_element and to_element; defaulting to the whole main_text.
+
+        :param delim: str: (Default value = "\n\n")
+        :param from_element: int: (Default value = 0)
+        :param to_element: Optional[int]: (Default value = None)
+        :param labels: set[DocItemLabel]
+        :param xsize: int: (Default value = 100)
+        :param ysize: int: (Default value = 100)
+        :param add_location: bool: (Default value = True)
+        :param add_content: bool: (Default value = True)
+        :param add_page_index: bool: (Default value = True)
+        :param # table specific flagsadd_table_cell_location: bool
+        :param add_table_cell_label: bool: (Default value = True)
+        :param add_table_cell_text: bool: (Default value = True)
+        :returns: The content of the document formatted as a DocTags string.
+        :rtype: str
         """
         new_line = ""
         if delim:
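The DocTags export takes the same from_element/to_element/labels slicing as the Markdown export, plus a grid (xsize, ysize) used when emitting location tokens and table-specific flags that control whether cell locations, cell labels and cell text are included. A usage sketch (illustrative only; doc is an assumed DoclingDocument instance):

    doctags = doc.export_to_document_tokens(
        xsize=100,      # location tokens use a 100 x 100 grid by default
        ysize=100,
        add_location=True,
        add_content=True,
        add_page_index=True,
        add_table_cell_location=False,
        add_table_cell_label=True,
        add_table_cell_text=True,
    )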
@@ -630,82 +1328,113 @@ class ExportedCCSDocument(
 
         # pagedims = self.get_map_to_page_dimensions()
 
-
-
+        skip_count = 0
+        for ix, (item, level) in enumerate(self.iterate_items(self.body)):
+            if skip_count < from_element:
+                skip_count += 1
+                continue  # skip as many items as you want
 
-
-
-                if isinstance(orig_item, Ref)
-                else orig_item
-            )
+            if to_element and ix >= to_element:
+                break
 
-
-
+            if not isinstance(item, DocItem):
+                continue
 
-
+            prov = item.prov
 
-
-            page_w = 0.0
-            page_h = 0.0
+            page_i = -1
 
-
-                add_location
-                and self.page_dimensions is not None
-                and prov is not None
-                and len(prov) > 0
-            ):
+            if add_location and len(self.pages) and len(prov) > 0:
 
-
-
+                page_i = prov[0].page_no
+                page_dim = self.pages[page_i].size
 
-
-
+                float(page_dim.width)
+                float(page_dim.height)
 
-
-
+            item_type = item.label
+            if isinstance(item, TextItem) and (item_type in labels):
 
-
-
-
-
-
-
-
-
-
-            )
+                doctags += item.export_to_document_tokens(
+                    doc=self,
+                    new_line=new_line,
+                    xsize=xsize,
+                    ysize=ysize,
+                    add_location=add_location,
+                    add_content=add_content,
+                    add_page_index=add_page_index,
+                )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
+            elif isinstance(item, TableItem) and (item_type in labels):
+
+                doctags += item.export_to_document_tokens(
+                    doc=self,
+                    new_line=new_line,
+                    xsize=xsize,
+                    ysize=ysize,
+                    add_caption=True,
+                    add_location=add_location,
+                    add_content=add_content,
+                    add_cell_location=add_table_cell_location,
+                    add_cell_label=add_table_cell_label,
+                    add_cell_text=add_table_cell_text,
+                    add_page_index=add_page_index,
+                )
 
-
-
-
-
-
-
-
-
-
-
-
-
-            )
+            elif isinstance(item, PictureItem) and (item_type in labels):
+
+                doctags += item.export_to_document_tokens(
+                    doc=self,
+                    new_line=new_line,
+                    xsize=xsize,
+                    ysize=ysize,
+                    add_caption=True,
+                    add_location=add_location,
+                    add_content=add_content,
+                    add_page_index=add_page_index,
+                )
 
         doctags += DocumentToken.END_DOCUMENT.value
 
         return doctags
+
+    def add_page(
+        self, page_no: int, size: Size, image: Optional[ImageRef] = None
+    ) -> PageItem:
+        """add_page.
+
+        :param page_no: int:
+        :param size: Size:
+
+        """
+        pitem = PageItem(page_no=page_no, size=size, image=image)
+
+        self.pages[page_no] = pitem
+        return pitem
+
+    @field_validator("version")
+    @classmethod
+    def check_version_is_compatible(cls, v: str) -> str:
+        """Check if this document version is compatible with current version."""
+        current_match = re.match(VERSION_PATTERN, CURRENT_VERSION)
+        doc_match = re.match(VERSION_PATTERN, v)
+        if (
+            doc_match is None
+            or current_match is None
+            or doc_match["major"] != current_match["major"]
+            or doc_match["minor"] > current_match["minor"]
+        ):
+            raise ValueError(
+                f"incompatible version {v} with schema version {CURRENT_VERSION}"
+            )
+        else:
+            return CURRENT_VERSION
+
+    @model_validator(mode="after")  # type: ignore
+    @classmethod
+    def validate_document(cls, d: "DoclingDocument"):
+        """validate_document."""
+        if not d.validate_tree(d.body) or not d.validate_tree(d.furniture):
+            raise ValueError("Document hierachy is inconsistent.")
+
+        return d