docling-core 1.7.1__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/chunker/__init__.py +2 -8
- docling_core/transforms/chunker/base.py +27 -40
- docling_core/transforms/chunker/hierarchical_chunker.py +144 -312
- docling_core/types/__init__.py +12 -8
- docling_core/types/doc/__init__.py +25 -0
- docling_core/types/doc/base.py +136 -451
- docling_core/types/doc/document.py +1288 -559
- docling_core/types/{experimental → doc}/labels.py +4 -1
- docling_core/types/legacy_doc/__init__.py +6 -0
- docling_core/types/legacy_doc/base.py +485 -0
- docling_core/types/{doc → legacy_doc}/doc_ann.py +1 -1
- docling_core/types/{doc → legacy_doc}/doc_ocr.py +1 -1
- docling_core/types/{doc → legacy_doc}/doc_raw.py +1 -1
- docling_core/types/legacy_doc/document.py +715 -0
- docling_core/types/rec/subject.py +1 -1
- docling_core/utils/generate_docs.py +82 -0
- docling_core/utils/{ds_generate_jsonschema.py → generate_jsonschema.py} +4 -4
- docling_core/utils/validators.py +3 -3
- {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/METADATA +11 -11
- {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/RECORD +24 -31
- docling_core-2.0.0.dist-info/entry_points.txt +5 -0
- docling_core/transforms/id_generator/__init__.py +0 -12
- docling_core/transforms/id_generator/base.py +0 -30
- docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -27
- docling_core/transforms/id_generator/uuid_generator.py +0 -34
- docling_core/transforms/metadata_extractor/__init__.py +0 -13
- docling_core/transforms/metadata_extractor/base.py +0 -59
- docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +0 -59
- docling_core/types/experimental/__init__.py +0 -30
- docling_core/types/experimental/base.py +0 -167
- docling_core/types/experimental/document.py +0 -1192
- docling_core/utils/ds_generate_docs.py +0 -144
- docling_core-1.7.1.dist-info/entry_points.txt +0 -5
- docling_core/types/{doc → legacy_doc}/tokens.py +0 -0
- {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/LICENSE +0 -0
- {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/WHEEL +0 -0
|
@@ -1,1192 +0,0 @@
|
|
|
1
|
-
"""Models for the Docling Document data type."""
|
|
2
|
-
|
|
3
|
-
import mimetypes
|
|
4
|
-
import re
|
|
5
|
-
import typing
|
|
6
|
-
from typing import Any, Dict, Final, List, Optional, Tuple, Union
|
|
7
|
-
|
|
8
|
-
import pandas as pd
|
|
9
|
-
from pydantic import (
|
|
10
|
-
AnyUrl,
|
|
11
|
-
BaseModel,
|
|
12
|
-
ConfigDict,
|
|
13
|
-
Field,
|
|
14
|
-
StringConstraints,
|
|
15
|
-
computed_field,
|
|
16
|
-
field_validator,
|
|
17
|
-
model_validator,
|
|
18
|
-
)
|
|
19
|
-
from tabulate import tabulate
|
|
20
|
-
from typing_extensions import Annotated
|
|
21
|
-
|
|
22
|
-
from docling_core.search.package import VERSION_PATTERN
|
|
23
|
-
from docling_core.types.base import _JSON_POINTER_REGEX
|
|
24
|
-
from docling_core.types.doc.tokens import DocumentToken
|
|
25
|
-
from docling_core.types.experimental import BoundingBox, Size
|
|
26
|
-
from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
|
|
27
|
-
|
|
28
|
-
# Integer constrained by pydantic to the unsigned 64-bit range [0, 2**64 - 1].
Uint64 = typing.Annotated[int, Field(ge=0, le=2**64 - 1)]

# Integer constrained by pydantic to the range [1, 100].
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]

# Default value of the ``DoclingDocument.version`` field.
CURRENT_VERSION: Final = "1.0.0"
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
class BasePictureData(BaseModel):  # TBD
    """Payload model attached to a picture item.

    No fields are declared yet; the class is a placeholder (marked TBD).
    """
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
class TableCell(BaseModel):
    """A single table cell: its grid span, its text and its header flags.

    Besides the native field layout, dictionaries are accepted whose ``bbox``
    entry is a mapping carrying a ``token`` text, optionally accompanied by a
    ``text_cell_bboxes`` list of token-bearing mappings; see
    :meth:`from_dict_format`.
    """

    bbox: Optional[BoundingBox] = None
    row_span: int = 1
    col_span: int = 1
    start_row_offset_idx: int
    end_row_offset_idx: int
    start_col_offset_idx: int
    end_col_offset_idx: int
    text: str
    column_header: bool = False
    row_header: bool = False
    row_section: bool = False

    @model_validator(mode="before")
    @classmethod
    def from_dict_format(cls, data: Any) -> Any:
        """Derive ``text`` from token information found in a raw dict.

        :param data: Any: raw input handed to the model before validation.
        :returns: ``data`` itself when it is not a dict or when its ``bbox``
            is missing, ``None`` or not a mapping (e.g. an already built
            ``BoundingBox``); otherwise a shallow copy with ``text`` filled in.
        """
        if not isinstance(data, dict):
            return data

        bbox = data.get("bbox")
        # Only a mapping can carry a "token"; a native BoundingBox instance
        # has no ``.get`` and must be passed through untouched.
        if not isinstance(bbox, dict):
            return data

        # Work on a shallow copy so the caller's dict is not modified.
        data = dict(data)

        text = bbox.get("token", "")
        if not len(text):
            # Fall back to the space-joined tokens of the individual text cells.
            text_cells = data.pop("text_cell_bboxes", None)
            if text_cells:
                text = " ".join(el["token"] for el in text_cells)
            text = text.strip()

        # Never replace an explicitly supplied text with an empty string.
        if text or "text" not in data:
            data["text"] = text

        return data
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
class BaseTableData(BaseModel):  # TBD
    """Table content: a flat list of cells plus the table dimensions."""

    table_cells: List[TableCell] = []
    num_rows: int = 0
    num_cols: int = 0

    @computed_field  # type: ignore
    @property
    def grid(
        self,
    ) -> List[List[TableCell]]:
        """Build the dense ``num_rows`` x ``num_cols`` matrix of cells.

        Every position starts as an empty 1x1 ``TableCell``; each entry of
        ``table_cells`` is then written to all positions its row/column
        offsets cover. Offsets are clamped to the table bounds, so a negative
        offset can no longer wrap around to the last row or column.

        :returns: list of rows, each a list of ``TableCell``. The matrix is
            rebuilt on every access.
        """
        n_rows, n_cols = self.num_rows, self.num_cols

        # Initialise empty table data grid (only empty cells)
        table_data = [
            [
                TableCell(
                    text="",
                    start_row_offset_idx=i,
                    end_row_offset_idx=i + 1,
                    start_col_offset_idx=j,
                    end_col_offset_idx=j + 1,
                )
                for j in range(n_cols)
            ]
            for i in range(n_rows)
        ]

        # Overwrite cells in table data for which there is actual cell content.
        for cell in self.table_cells:
            row_lo = max(0, min(cell.start_row_offset_idx, n_rows))
            row_hi = max(0, min(cell.end_row_offset_idx, n_rows))
            col_lo = max(0, min(cell.start_col_offset_idx, n_cols))
            col_hi = max(0, min(cell.end_col_offset_idx, n_cols))
            for i in range(row_lo, row_hi):
                for j in range(col_lo, col_hi):
                    table_data[i][j] = cell

        return table_data
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
class DocumentOrigin(BaseModel):
    """Description of the original file a document was created from."""

    mimetype: str  # the mimetype of the original file
    binary_hash: Uint64  # the binary hash of the original file.
    # TODO: Change to be Uint64 and provide utility method to generate

    filename: str  # The name of the original file, including extension, without path.
    # Could stem from filesystem, source URI, Content-Disposition header, ...

    uri: Optional[AnyUrl] = (
        None  # any possible reference to a source file,
        # from any file handler protocol (e.g. https://, file://, s3://)
    )

    @field_validator("binary_hash", mode="before")
    @classmethod
    def parse_hex_string(cls, value):
        """Convert a hexadecimal string into an unsigned 64-bit integer.

        :param value: raw ``binary_hash`` input.
        :returns: for a string, its base-16 integer value reduced to the
            lowest 64 bits; any other input is returned unchanged.
        :raises ValueError: if a string is not valid hexadecimal.
        """
        if not isinstance(value, str):
            return value  # If already an int, return it as is.

        try:
            # Convert hex string to an integer
            hash_int = int(value, 16)
        except ValueError as err:
            raise ValueError(f"Invalid sha256 hexdigest: {value}") from err

        # Keep only the lowest 64 bits so the result fits the Uint64 range.
        return hash_int & 0xFFFFFFFFFFFFFFFF

    @field_validator("mimetype")
    @classmethod
    def validate_mimetype(cls, v):
        """Reject MIME types that are not listed in ``mimetypes.types_map``.

        :param v: the MIME type string to check.
        :returns: ``v`` unchanged when it is a known MIME type.
        :raises ValueError: if ``v`` is not among the known MIME types.
        """
        # Check if the provided MIME type is valid using mimetypes module
        if v not in mimetypes.types_map.values():
            raise ValueError(f"'{v}' is not a valid MIME type")
        return v
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
class RefItem(BaseModel):
    """Reference to another part of a document, stored as a ``$ref`` string."""

    model_config = ConfigDict(
        populate_by_name=True,
    )

    cref: str = Field(alias="$ref", pattern=_JSON_POINTER_REGEX)

    # This method makes RefItem compatible with DocItem
    def get_ref(self):
        """Return this reference itself."""
        return self

    def resolve(self, doc: "DoclingDocument"):
        """Look up the object this reference points to inside ``doc``.

        The reference string is split on ``/``. Two components address an
        attribute of ``doc``; three components address an element, by integer
        index, of a sequence attribute of ``doc``.

        :param doc: "DoclingDocument": document to resolve against.
        :returns: the referenced attribute or sequence element.
        :raises RuntimeError: for any other number of path components.
        """
        parts = self.cref.split("/")
        count = len(parts)

        if count == 2:
            return doc.__getattribute__(parts[1])

        if count == 3:
            # Parse the index first, then fetch the container attribute.
            position = int(parts[2])
            return doc.__getattribute__(parts[1])[position]

        raise RuntimeError(f"Unsupported number of path components: {count}")
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
class ImageRef(BaseModel):
    """Reference to an image: MIME type, dpi, size and URI."""

    mimetype: str
    dpi: int
    size: Size
    uri: AnyUrl

    @field_validator("mimetype")
    @classmethod
    def validate_mimetype(cls, v):
        """Accept only MIME types listed in ``mimetypes.types_map``.

        :param v: the MIME type string to check.
        :returns: ``v`` unchanged when it is a known MIME type.
        :raises ValueError: if ``v`` is not among the known MIME types.
        """
        known_types = mimetypes.types_map.values()
        if v in known_types:
            return v
        raise ValueError(f"'{v}' is not a valid MIME type")
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
class ProvenanceItem(BaseModel):
    """Provenance record: page number, bounding box and character span."""

    page_no: int
    bbox: BoundingBox
    charspan: Tuple[int, int]
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
class NodeItem(BaseModel):
    """Tree node carrying its own reference, a parent and child references."""

    model_config = ConfigDict(extra="forbid")

    self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
    parent: Optional[RefItem] = None
    children: List[RefItem] = []

    def get_ref(self):
        """Return a new ``RefItem`` pointing at this node's ``self_ref``."""
        own_ref = RefItem(cref=self.self_ref)
        return own_ref
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
class GroupItem(NodeItem):  # Container type, can't be a leaf node
    """Named, labelled container node."""

    # Name of the group, e.g. "Introduction Chapter",
    # "Slide 5", "Navigation menu list", ...
    name: str = "group"
    label: GroupLabel = GroupLabel.UNSPECIFIED
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
class DocItem(NodeItem):
    """Base type for any element that carries content, can be a leaf node."""

    label: DocItemLabel
    prov: List[ProvenanceItem] = []

    def get_location_tokens(
        self,
        doc: "DoclingDocument",
        new_line: str,
        xsize: int = 100,
        ysize: int = 100,
        add_page_index: bool = True,
    ) -> str:
        """Build the location token string for all provenance entries.

        :param doc: "DoclingDocument": document whose ``pages`` supply page sizes.
        :param new_line: str: text appended after each location token.
        :param xsize: int: (Default value = 100)
        :param ysize: int: (Default value = 100)
        :param add_page_index: bool: pass the page number to the token
            generator when True, ``-1`` otherwise. (Default value = True)
        :returns: concatenated location tokens, or ``""`` without provenance.
        """
        pieces = []
        for entry in self.prov:
            width, height = doc.pages[entry.page_no].size.as_tuple()
            token = DocumentToken.get_location(
                bbox=entry.bbox.to_bottom_left_origin(height).as_tuple(),
                page_w=width,
                page_h=height,
                xsize=xsize,
                ysize=ysize,
                page_i=entry.page_no if add_page_index else -1,
            )
            pieces.append(f"{token}{new_line}")

        return "".join(pieces)
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
class TextItem(DocItem):
    """Content item holding an original and a sanitized text."""

    orig: str  # untreated representation
    text: str  # sanitized representation

    def export_to_document_tokens(
        self,
        doc: "DoclingDocument",
        new_line: str = "\n",
        xsize: int = 100,
        ysize: int = 100,
        add_location: bool = True,
        add_content: bool = True,
        add_page_index: bool = True,
    ):
        r"""Serialize this text item as ``<label>…</label>`` document tokens.

        :param doc: "DoclingDocument": document used to compute locations.
        :param new_line: str: text appended after the closing tag.
            (Default value = "\n")
        :param xsize: int: (Default value = 100)
        :param ysize: int: (Default value = 100)
        :param add_location: bool: include location tokens.
            (Default value = True)
        :param add_content: bool: include the stripped text.
            (Default value = True)
        :param add_page_index: bool: (Default value = True)
        :returns: the token string.
        """
        tag = self.label.value
        parts = [f"<{tag}>"]

        # TODO: This must be done through an explicit mapping.

        if add_location:
            # Location tokens are emitted inline, without a separator.
            parts.append(
                self.get_location_tokens(
                    doc=doc,
                    new_line="",
                    xsize=xsize,
                    ysize=ysize,
                    add_page_index=add_page_index,
                )
            )

        if add_content and self.text is not None:
            parts.append(self.text.strip())

        parts.append(f"</{tag}>{new_line}")

        return "".join(parts)
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
class SectionHeaderItem(TextItem):
    """Text item fixed to the section-header label, with a heading level."""

    label: typing.Literal[DocItemLabel.SECTION_HEADER] = DocItemLabel.SECTION_HEADER
    level: LevelNumber
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
class FloatingItem(DocItem):
    """Content item with caption, reference and footnote refs and an optional image."""

    captions: List[RefItem] = []
    references: List[RefItem] = []
    footnotes: List[RefItem] = []
    image: Optional[ImageRef] = None
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
class PictureItem(FloatingItem):
    """Floating item fixed to the picture label, carrying picture data."""

    label: typing.Literal[DocItemLabel.PICTURE] = DocItemLabel.PICTURE

    data: BasePictureData

    def export_to_document_tokens(
        self,
        doc: "DoclingDocument",
        new_line: str = "\n",
        xsize: int = 100,
        ysize: int = 100,
        add_location: bool = True,
        add_caption: bool = True,
        add_content: bool = True,  # not used at the moment
        add_page_index: bool = True,
    ):
        r"""Serialize this picture as figure document tokens.

        :param doc: "DoclingDocument": document used for locations and captions.
        :param new_line: str: separator appended after each emitted part.
            (Default value = "\n")
        :param xsize: int: (Default value = 100)
        :param ysize: int: (Default value = 100)
        :param add_location: bool: include location tokens.
            (Default value = True)
        :param add_caption: bool: include the caption text.
            (Default value = True)
        :param add_content: bool: not used at the moment.
            (Default value = True)
        :param add_page_index: bool: (Default value = True)
        :returns: the token string.
        """
        chunks = [f"{DocumentToken.BEG_FIGURE.value}{new_line}"]

        if add_location:
            chunks.append(
                self.get_location_tokens(
                    doc=doc,
                    new_line=new_line,
                    xsize=xsize,
                    ysize=ysize,
                    add_page_index=add_page_index,
                )
            )

        if add_caption and len(self.captions):
            # Caption texts are concatenated without a separator.
            caption_text = "".join(cap.resolve(doc).text for cap in self.captions)

            if len(caption_text):
                chunks.append(f"{DocumentToken.BEG_CAPTION.value}")
                chunks.append(f"{caption_text.strip()}")
                chunks.append(f"{DocumentToken.END_CAPTION.value}")
                chunks.append(f"{new_line}")

        chunks.append(f"{DocumentToken.END_FIGURE.value}{new_line}")

        return "".join(chunks)
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
class TableItem(FloatingItem):
    """Floating item fixed to the table label, carrying table data.

    Offers exports to a pandas DataFrame, to HTML and to document tokens.
    """

    data: BaseTableData
    label: typing.Literal[DocItemLabel.TABLE] = DocItemLabel.TABLE

    def export_to_dataframe(self) -> pd.DataFrame:
        """Export the table as a Pandas DataFrame.

        Leading grid rows that contain at least one column-header cell become
        the column names; names from several header rows are joined with
        ``"."``. All remaining rows become the data rows.

        :returns: the DataFrame; empty when the table has no rows or columns.
        :raises RuntimeError: if a grid row turns out to be empty.
        """
        if self.data is None or self.data.num_rows == 0 or self.data.num_cols == 0:
            return pd.DataFrame()

        # ``grid`` is rebuilt on every access, so compute it exactly once.
        grid = self.data.grid

        # Count how many rows are column headers
        num_headers = 0
        for row in grid:
            if len(row) == 0:
                raise RuntimeError(
                    f"Invalid table. {len(row)=} but {self.data.num_cols=}."
                )

            if not any(cell.column_header for cell in row):
                break
            num_headers += 1

        # Create the column names from all col_headers
        columns: Optional[List[str]] = None
        if num_headers > 0:
            columns = ["" for _ in range(self.data.num_cols)]
            for header_row in grid[:num_headers]:
                for j, cell in enumerate(header_row):
                    col_name = cell.text
                    if columns[j] != "":
                        col_name = f".{col_name}"
                    columns[j] += col_name

        # Create table data
        table_data = [[cell.text for cell in row] for row in grid[num_headers:]]

        # Create DataFrame
        return pd.DataFrame(table_data, columns=columns)

    def export_to_html(self) -> str:
        """Export the table as html.

        Each cell is emitted once, at the grid position where it starts, with
        ``rowspan``/``colspan`` attributes for spans above 1. Column headers
        use ``<th>``, all other cells ``<td>``. Cell text is HTML-escaped.

        :returns: the ``<table>`` markup, or ``""`` when there are no cells.
        """
        import html  # local import; the module's import block is left as is

        if not len(self.data.table_cells):
            return ""

        # ``grid`` is rebuilt on every access, so compute it exactly once.
        grid = self.data.grid

        rows_html = []
        for i in range(self.data.num_rows):
            cells_html = []
            for j in range(self.data.num_cols):
                cell: TableCell = grid[i][j]

                # A spanning cell fills several positions; emit it only once.
                if cell.start_row_offset_idx != i or cell.start_col_offset_idx != j:
                    continue

                # Escape so that "<", ">" and "&" in the text cannot break markup.
                content = html.escape(cell.text.strip())
                celltag = "th" if cell.column_header else "td"

                opening_tag = f"{celltag}"
                if cell.row_span > 1:
                    opening_tag += f' rowspan="{cell.row_span}"'
                if cell.col_span > 1:
                    opening_tag += f' colspan="{cell.col_span}"'

                cells_html.append(f"<{opening_tag}>{content}</{celltag}>")
            rows_html.append("<tr>" + "".join(cells_html) + "</tr>")

        return "<table>" + "".join(rows_html) + "</table>"

    def export_to_document_tokens(
        self,
        doc: "DoclingDocument",
        new_line: str = "\n",
        xsize: int = 100,
        ysize: int = 100,
        add_location: bool = True,
        add_caption: bool = True,
        add_content: bool = True,
        add_cell_location: bool = True,
        add_cell_label: bool = True,
        add_cell_text: bool = True,
        add_page_index: bool = True,
    ):
        r"""Export table to document tokens format.

        :param doc: "DoclingDocument": document used for locations and captions.
        :param new_line: str: separator appended after each emitted part.
            (Default value = "\n")
        :param xsize: int: (Default value = 100)
        :param ysize: int: (Default value = 100)
        :param add_location: bool: include the table location tokens.
            (Default value = True)
        :param add_caption: bool: include the caption text.
            (Default value = True)
        :param add_content: bool: include the rows and cells.
            (Default value = True)
        :param add_cell_location: bool: include per-cell location tokens.
            (Default value = True)
        :param add_cell_label: bool: include per-cell label tokens.
            (Default value = True)
        :param add_cell_text: bool: include the stripped cell text.
            (Default value = True)
        :param add_page_index: bool: (Default value = True)
        :returns: the token string.
        """
        body = f"{DocumentToken.BEG_TABLE.value}{new_line}"

        if add_location:
            body += self.get_location_tokens(
                doc=doc,
                new_line=new_line,
                xsize=xsize,
                ysize=ysize,
                add_page_index=add_page_index,
            )

        if add_caption and len(self.captions):
            text = ""
            for cap in self.captions:
                text += cap.resolve(doc).text

            if len(text):
                body += f"{DocumentToken.BEG_CAPTION.value}"
                body += f"{text.strip()}"
                body += f"{DocumentToken.END_CAPTION.value}"
                body += f"{new_line}"

        if add_content and len(self.data.table_cells) > 0:
            for i, row in enumerate(self.data.grid):
                body += f"<row_{i}>"
                for j, col in enumerate(row):

                    text = ""
                    if add_cell_text:
                        text = col.text.strip()

                    cell_loc = ""
                    if (
                        col.bbox is not None
                        and add_cell_location
                        and len(self.prov) > 0
                    ):
                        # Cell locations are relative to the page of the
                        # table's first provenance entry.
                        page_no = self.prov[0].page_no
                        page_w, page_h = doc.pages[page_no].size.as_tuple()
                        cell_loc = DocumentToken.get_location(
                            bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
                            page_w=page_w,
                            page_h=page_h,
                            xsize=xsize,
                            ysize=ysize,
                            page_i=page_no if add_page_index else -1,
                        )

                    cell_label = ""
                    if add_cell_label:
                        if col.column_header:
                            cell_label = "<col_header>"
                        elif col.row_header:
                            cell_label = "<row_header>"
                        elif col.row_section:
                            cell_label = "<row_section>"
                        else:
                            cell_label = "<body>"

                    body += f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>"

                body += f"</row_{i}>{new_line}"

        body += f"{DocumentToken.END_TABLE.value}{new_line}"

        return body
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
class KeyValueItem(DocItem):
    """Content item for key-value data; adds no fields to ``DocItem``."""
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
# Union of the concrete content-carrying item types defined in this module.
ContentItem = Union[
    TextItem,
    SectionHeaderItem,
    PictureItem,
    TableItem,
    KeyValueItem,
]
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
class PageItem(BaseModel):
    """A page: its size, an optional image and its page number."""

    # A page carries separate root items for furniture and body,
    # only referencing items on the page
    size: Size
    image: Optional[ImageRef] = None
    page_no: int
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
class DescriptionItem(BaseModel):
    """Document description model; no fields are declared yet."""
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
class DoclingDocument(BaseModel):
|
|
630
|
-
"""DoclingDocument."""
|
|
631
|
-
|
|
632
|
-
schema_name: typing.Literal["DoclingDocument"] = "DoclingDocument"
|
|
633
|
-
version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
|
|
634
|
-
CURRENT_VERSION
|
|
635
|
-
)
|
|
636
|
-
description: DescriptionItem
|
|
637
|
-
name: str # The working name of this document, without extensions
|
|
638
|
-
# (could be taken from originating doc, or just "Untitled 1")
|
|
639
|
-
origin: Optional[DocumentOrigin] = (
|
|
640
|
-
None # DoclingDocuments may specify an origin (converted to DoclingDocument).
|
|
641
|
-
# This is optional, e.g. a DoclingDocument could also be entirely
|
|
642
|
-
# generated from synthetic data.
|
|
643
|
-
)
|
|
644
|
-
|
|
645
|
-
furniture: GroupItem = GroupItem(
|
|
646
|
-
name="_root_", self_ref="#/furniture"
|
|
647
|
-
) # List[RefItem] = []
|
|
648
|
-
body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []
|
|
649
|
-
|
|
650
|
-
groups: List[GroupItem] = []
|
|
651
|
-
texts: List[Union[SectionHeaderItem, TextItem]] = []
|
|
652
|
-
pictures: List[PictureItem] = []
|
|
653
|
-
tables: List[TableItem] = []
|
|
654
|
-
key_value_items: List[KeyValueItem] = []
|
|
655
|
-
|
|
656
|
-
pages: Dict[int, PageItem] = {} # empty as default
|
|
657
|
-
|
|
658
|
-
def add_group(
    self,
    label: Optional[GroupLabel] = None,
    name: Optional[str] = None,
    parent: Optional[GroupItem] = None,
) -> GroupItem:
    """Create a group, register it in ``groups`` and attach it to a parent.

    :param label: Optional[GroupLabel]: label to set on the new group; the
        model default is kept when None. (Default value = None)
    :param name: Optional[str]: name to set on the new group; the model
        default is kept when None. (Default value = None)
    :param parent: Optional[GroupItem]: node to attach to; the document
        ``body`` is used when omitted. (Default value = None)
    :returns: the newly created group.
    """
    owner = parent if parent else self.body

    # The reference encodes the position the group takes in ``self.groups``.
    ref_path = f"#/groups/{len(self.groups)}"

    new_group = GroupItem(self_ref=ref_path, parent=owner.get_ref())
    if name is not None:
        new_group.name = name
    if label is not None:
        new_group.label = label

    self.groups.append(new_group)
    owner.children.append(RefItem(cref=ref_path))

    return new_group
|
|
687
|
-
|
|
688
|
-
def add_text(
    self,
    label: str,
    text: str,
    orig: Optional[str] = None,
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[GroupItem] = None,
):
    """Create a text item, register it in ``texts`` and attach it to a parent.

    :param label: str: label of the new text item.
    :param text: str: sanitized text.
    :param orig: Optional[str]: untreated text; falls back to ``text`` when
        omitted or empty. (Default value = None)
    :param prov: Optional[ProvenanceItem]: provenance entry to record on the
        item. (Default value = None)
    :param parent: Optional[GroupItem]: node to attach to; the document
        ``body`` is used when omitted. (Default value = None)
    :returns: the newly created text item.
    """
    owner = parent if parent else self.body
    original_text = orig if orig else text

    # The reference encodes the position the item takes in ``self.texts``.
    ref_path = f"#/texts/{len(self.texts)}"
    new_item = TextItem(
        label=label,
        text=text,
        orig=original_text,
        self_ref=ref_path,
        parent=owner.get_ref(),
    )
    if prov:
        new_item.prov.append(prov)

    self.texts.append(new_item)
    owner.children.append(RefItem(cref=ref_path))

    return new_item
|
|
727
|
-
|
|
728
|
-
def add_table(
    self,
    data: BaseTableData,
    caption: Optional[Union[TextItem, RefItem]] = None,  # This is not cool yet.
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[GroupItem] = None,
):
    """Create a table item, register it in ``tables`` and attach it to a parent.

    :param data: BaseTableData: the table content.
    :param caption: Optional[Union[TextItem, RefItem]]: caption whose
        reference is recorded on the table. (Default value = None)
    :param prov: Optional[ProvenanceItem]: provenance entry to record on the
        table. (Default value = None)
    :param parent: Optional[GroupItem]: node to attach to; the document
        ``body`` is used when omitted. (Default value = None)
    :returns: the newly created table item.
    """
    owner = parent if parent else self.body

    # The reference encodes the position the table takes in ``self.tables``.
    ref_path = f"#/tables/{len(self.tables)}"

    new_table = TableItem(
        label=DocItemLabel.TABLE,
        data=data,
        self_ref=ref_path,
        parent=owner.get_ref(),
    )
    if prov:
        new_table.prov.append(prov)
    if caption:
        new_table.captions.append(caption.get_ref())

    self.tables.append(new_table)
    owner.children.append(RefItem(cref=ref_path))

    return new_table
|
|
762
|
-
|
|
763
|
-
def add_picture(
    self,
    data: BasePictureData,
    caption: Optional[Union[TextItem, RefItem]] = None,
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[GroupItem] = None,
):
    """Create a picture item, register it in ``pictures`` and attach it to a parent.

    :param data: BasePictureData: the picture payload.
    :param caption: Optional[Union[TextItem, RefItem]]: caption whose
        reference is recorded on the picture. (Default value = None)
    :param prov: Optional[ProvenanceItem]: provenance entry to record on the
        picture. (Default value = None)
    :param parent: Optional[GroupItem]: node to attach to; the document
        ``body`` is used when omitted. (Default value = None)
    :returns: the newly created picture item.
    """
    owner = parent if parent else self.body

    # The reference encodes the position the picture takes in ``self.pictures``.
    ref_path = f"#/pictures/{len(self.pictures)}"

    new_picture = PictureItem(
        label=DocItemLabel.PICTURE,
        data=data,
        self_ref=ref_path,
        parent=owner.get_ref(),
    )
    if prov:
        new_picture.prov.append(prov)
    if caption:
        new_picture.captions.append(caption.get_ref())

    self.pictures.append(new_picture)
    owner.children.append(RefItem(cref=ref_path))

    return new_picture
|
|
800
|
-
|
|
801
|
-
def add_heading(
    self,
    text: str,
    orig: Optional[str] = None,
    level: LevelNumber = 1,
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[GroupItem] = None,
):
    """Create a section header, register it in ``texts`` and attach it to a parent.

    :param text: str: sanitized heading text.
    :param orig: Optional[str]: untreated heading text; falls back to
        ``text`` when omitted or empty. (Default value = None)
    :param level: LevelNumber: heading level. (Default value = 1)
    :param prov: Optional[ProvenanceItem]: provenance entry to record on the
        header. (Default value = None)
    :param parent: Optional[GroupItem]: node to attach to; the document
        ``body`` is used when omitted. (Default value = None)
    :returns: the newly created section header item.
    """
    owner = parent if parent else self.body
    original_text = orig if orig else text

    # Headings share the ``texts`` list, so they use a ``#/texts/`` reference.
    ref_path = f"#/texts/{len(self.texts)}"
    header = SectionHeaderItem(
        level=level,
        text=text,
        orig=original_text,
        self_ref=ref_path,
        parent=owner.get_ref(),
    )
    if prov:
        header.prov.append(prov)

    self.texts.append(header)
    owner.children.append(RefItem(cref=ref_path))

    return header
|
|
841
|
-
|
|
842
|
-
def num_pages(self) -> int:
    """Return the number of entries in ``self.pages``."""
    # The mapping's own length is the page count; no values view is needed.
    return len(self.pages)
|
|
845
|
-
|
|
846
|
-
def validate_tree(self, root) -> bool:
    """Check that every descendant of ``root`` points back to its parent.

    Each child reference of a node is resolved against this document; the
    resolved child's own ``parent`` reference must resolve to an object equal
    to that node. The check is applied recursively.

    :param root: node whose subtree is checked.
    :returns: True when all parent links are consistent (also for a node
        without children); False at the first inconsistency, including a
        child whose ``parent`` is None.
    """
    for child_ref in root.children:
        child = child_ref.resolve(self)

        parent_ref = child.parent
        # A child without a parent link cannot belong to this tree.
        if parent_ref is None or parent_ref.resolve(self) != root:
            return False

        if not self.validate_tree(child):
            return False

    return True
|
|
856
|
-
|
|
857
|
-
def iterate_items(
    self,
    root: Optional[NodeItem] = None,
    with_groups: bool = False,
    traverse_pictures: bool = True,
    page_no: Optional[int] = None,
    _level: int = 0,  # fixed parameter, carries through the node nesting level
) -> typing.Iterable[Tuple[NodeItem, int]]:  # tuple of node and level
    """Walk the tree below ``root`` depth-first, yielding ``(node, level)`` pairs.

    :param root: Optional[NodeItem]: Node to start from; ``self.body`` when
        omitted. (Default value = None)
    :param with_groups: bool: Also yield group nodes, not only their
        descendants. (Default value = False)
    :param traverse_pictures: bool: When False, picture children and everything
        below them are skipped. (Default value = True)
    :param page_no: Optional[int]: When set, a document item is only yielded if
        one of its provenance entries is on this page. (Default value = None)
    :param _level: int: Nesting level of ``root``; internal, incremented on
        each recursion step. (Default value = 0)
    :returns: Iterable of ``(node, level)`` tuples.
    """
    if not root:
        root = self.body

    if not isinstance(root, GroupItem) or with_groups:
        if isinstance(root, DocItem) and page_no is not None:
            # Yield the item once, even if several provenance entries lie on
            # the requested page.
            if any(prov.page_no == page_no for prov in root.prov):
                yield root, _level
        else:
            yield root, _level

    # Traverse children
    for child_ref in root.children:
        child = child_ref.resolve(self)

        if not isinstance(child, NodeItem):
            continue
        if isinstance(child, PictureItem) and not traverse_pictures:
            continue

        # Forward every filter so that it applies to the whole subtree and
        # not just to the node the caller started from.
        yield from self.iterate_items(
            child,
            with_groups=with_groups,
            traverse_pictures=traverse_pictures,
            page_no=page_no,
            _level=_level + 1,
        )
|
|
899
|
-
|
|
900
|
-
def print_element_tree(self):
    """Print one line per node of the document tree, groups included.

    Each line is prefixed by one space per nesting level and shows the running
    index and the label value; group nodes additionally show their name.
    """
    for ix, (item, level) in enumerate(self.iterate_items(with_groups=True)):
        if isinstance(item, GroupItem):
            description = f"{ix}: {item.label.value} with name={item.name}"
        elif isinstance(item, DocItem):
            description = f"{ix}: {item.label.value}"
        else:
            # Neither a group nor a document item: nothing is printed.
            continue
        print(" " * level, description)
|
|
907
|
-
|
|
908
|
-
def export_to_markdown(
    self,
    delim: str = "\n\n",
    from_element: int = 0,
    to_element: Optional[int] = None,
    labels: list[DocItemLabel] = [
        DocItemLabel.TITLE,
        DocItemLabel.SECTION_HEADER,
        DocItemLabel.PARAGRAPH,
        DocItemLabel.CAPTION,
        DocItemLabel.TABLE,
        DocItemLabel.TEXT,
    ],
    strict_text: bool = False,
) -> str:
    r"""Serialize to Markdown.

    Operates on a slice of the items produced by iterating the document body,
    as defined through arguments from_element and to_element; defaulting to
    all items.

    :param delim: Delimiter to use when concatenating the various
        Markdown parts. Defaults to "\n\n".
    :type delim: str
    :param from_element: Body slicing start index (inclusive).
        Defaults to 0.
    :type from_element: int
    :param to_element: Body slicing stop index
        (exclusive). Defaults to None, meaning no upper bound.
    :type to_element: Optional[int]
    :param labels: Only items whose label is contained in this list are
        exported. The default list is only read, never modified.
    :type labels: list[DocItemLabel]
    :param strict_text: When True, headings are emitted without Markdown
        markers and tables are left out. Defaults to False.
    :type strict_text: bool
    :returns: The exported Markdown representation.
    :rtype: str
    """
    has_title = False
    prev_text = ""
    md_texts: list[str] = []

    for ix, (item, _level) in enumerate(self.iterate_items(self.body)):
        if ix < from_element:
            continue  # skip everything before the requested start

        # Explicit None check so that to_element=0 yields an empty export
        # instead of being mistaken for "no limit".
        if to_element is not None and ix >= to_element:
            break

        if not isinstance(item, DocItem):
            continue

        markdown_text = ""
        item_type = item.label

        if isinstance(item, TextItem) and item_type in labels:
            text = item.text

            # ignore repeated text
            if prev_text == text or text is None:
                continue
            prev_text = text

            if item_type == DocItemLabel.TITLE and not has_title:
                # first title match
                markdown_text = f"{text}" if strict_text else f"# {text}"
                has_title = True
            elif item_type in (
                DocItemLabel.TITLE,
                DocItemLabel.SECTION_HEADER,
                "subtitle-level-1",  # legacy label string, kept for compatibility
            ):
                # secondary titles
                markdown_text = f"{text}" if strict_text else f"## {text}"
            else:
                # normal text
                markdown_text = text

        elif (
            isinstance(item, TableItem)
            and item.data
            and item_type in labels
            and not strict_text
        ):
            table = [[cell.text for cell in row] for row in item.data.grid]

            # A header row plus at least one body row is required.
            if len(table) > 1 and len(table[0]) > 0:
                try:
                    md_table = tabulate(
                        table[1:], headers=table[0], tablefmt="github"
                    )
                except ValueError:
                    # Retry with number parsing disabled.
                    md_table = tabulate(
                        table[1:],
                        headers=table[0],
                        tablefmt="github",
                        disable_numparse=True,
                    )

                markdown_text = md_table

        if markdown_text:
            md_texts.append(markdown_text)

    result = delim.join(md_texts)
    return result
|
|
1033
|
-
|
|
1034
|
-
def export_to_document_tokens(
    self,
    delim: str = "\n\n",
    from_element: int = 0,
    to_element: Optional[int] = None,
    labels: list[DocItemLabel] = [
        DocItemLabel.TITLE,
        DocItemLabel.SECTION_HEADER,
        DocItemLabel.PARAGRAPH,
        DocItemLabel.CAPTION,
        DocItemLabel.TABLE,
        DocItemLabel.TEXT,
    ],
    xsize: int = 100,
    ysize: int = 100,
    add_location: bool = True,
    add_content: bool = True,
    add_page_index: bool = True,
    # table specific flags
    add_table_cell_location: bool = False,
    add_table_cell_label: bool = True,
    add_table_cell_text: bool = True,
) -> str:
    r"""Exports the document content to a DocumentToken format.

    Operates on a slice of the items produced by iterating the document body,
    as defined through arguments from_element and to_element; defaulting to
    all items.

    :param delim: str: Only its truthiness is used: when non-empty, a newline
        is inserted after the opening token and passed on to the per-item
        exporters. (Default value = "\n\n")
    :param from_element: int: Slicing start index, inclusive.
        (Default value = 0)
    :param to_element: Optional[int]: Slicing stop index, exclusive; None
        means no upper bound. (Default value = None)
    :param labels: list[DocItemLabel]: Only items whose label is contained in
        this list are exported. The default list is only read, never modified.
    :param xsize: int: (Default value = 100)
    :param ysize: int: (Default value = 100)
    :param add_location: bool: (Default value = True)
    :param add_content: bool: (Default value = True)
    :param add_page_index: bool: (Default value = True)
    :param add_table_cell_location: bool: Table specific flag.
        (Default value = False)
    :param add_table_cell_label: bool: Table specific flag.
        (Default value = True)
    :param add_table_cell_text: bool: Table specific flag.
        (Default value = True)
    :returns: The content of the document formatted as a DocTags string.
    :rtype: str
    """
    new_line = "\n" if delim else ""

    # Collect the pieces and join once at the end instead of growing a string.
    parts = [f"{DocumentToken.BEG_DOCUMENT.value}{new_line}"]

    # Arguments shared by the text, table and picture exporters.
    common_kwargs = dict(
        doc=self,
        new_line=new_line,
        xsize=xsize,
        ysize=ysize,
        add_location=add_location,
        add_content=add_content,
        add_page_index=add_page_index,
    )

    for ix, (item, _level) in enumerate(self.iterate_items(self.body)):
        if ix < from_element:
            continue  # skip everything before the requested start

        # Explicit None check so that to_element=0 yields an empty export
        # instead of being mistaken for "no limit".
        if to_element is not None and ix >= to_element:
            break

        if not isinstance(item, DocItem):
            continue

        if item.label not in labels:
            continue

        if isinstance(item, TextItem):
            parts.append(item.export_to_document_tokens(**common_kwargs))

        elif isinstance(item, TableItem):
            parts.append(
                item.export_to_document_tokens(
                    add_caption=True,
                    add_cell_location=add_table_cell_location,
                    add_cell_label=add_table_cell_label,
                    add_cell_text=add_table_cell_text,
                    **common_kwargs,
                )
            )

        elif isinstance(item, PictureItem):
            parts.append(
                item.export_to_document_tokens(add_caption=True, **common_kwargs)
            )

    parts.append(DocumentToken.END_DOCUMENT.value)

    return "".join(parts)
|
|
1154
|
-
|
|
1155
|
-
def add_page(self, page_no: int, size: Size) -> PageItem:
    """Create a page item and store it in ``self.pages`` under its page number.

    An entry already stored under ``page_no`` is replaced.

    :param page_no: int: Page number, used as the key in ``self.pages``.
    :param size: Size: Size of the page.
    :returns: The newly created page item.
    """
    self.pages[page_no] = page = PageItem(page_no=page_no, size=size)
    return page
|
|
1166
|
-
|
|
1167
|
-
@field_validator("version")
@classmethod
def check_version_is_compatible(cls, v: str) -> str:
    """Check if this document version is compatible with current version.

    A document is accepted when its version matches VERSION_PATTERN, has the
    same major component as CURRENT_VERSION and a minor component that is not
    greater than the current one.

    :param v: str: Version string found in the document.
    :returns: CURRENT_VERSION, which replaces the validated value.
    :raises ValueError: If either version does not match VERSION_PATTERN or
        the document version is incompatible.
    """
    current_match = re.match(VERSION_PATTERN, CURRENT_VERSION)
    doc_match = re.match(VERSION_PATTERN, v)
    if (
        doc_match is None
        or current_match is None
        or doc_match["major"] != current_match["major"]
        # Compare numerically: as strings, "10" would sort before "9".
        or int(doc_match["minor"]) > int(current_match["minor"])
    ):
        raise ValueError(
            f"incompatible version {v} with schema version {CURRENT_VERSION}"
        )

    return CURRENT_VERSION
|
|
1184
|
-
|
|
1185
|
-
@model_validator(mode="after")  # type: ignore
@classmethod
def validate_document(cls, d: "DoclingDocument"):
    """Reject documents whose body or furniture tree is inconsistent.

    :param d: DoclingDocument: The document being validated.
    :returns: The same document when both trees pass ``validate_tree``.
    :raises ValueError: If either tree fails the check.
    """
    # The furniture tree is only checked when the body tree is consistent.
    consistent = d.validate_tree(d.body) and d.validate_tree(d.furniture)
    if not consistent:
        raise ValueError("Document hierachy is inconsistent.")

    return d
|