docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +33 -37
- docling/backend/asciidoc_backend.py +431 -0
- docling/backend/docling_parse_backend.py +20 -16
- docling/backend/docling_parse_v2_backend.py +248 -0
- docling/backend/html_backend.py +429 -0
- docling/backend/md_backend.py +346 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +496 -0
- docling/backend/pdf_backend.py +78 -0
- docling/backend/pypdfium2_backend.py +16 -11
- docling/cli/main.py +96 -65
- docling/datamodel/base_models.py +79 -193
- docling/datamodel/document.py +405 -320
- docling/datamodel/pipeline_options.py +19 -3
- docling/datamodel/settings.py +16 -1
- docling/document_converter.py +240 -251
- docling/models/base_model.py +28 -0
- docling/models/base_ocr_model.py +40 -10
- docling/models/ds_glm_model.py +244 -30
- docling/models/easyocr_model.py +57 -42
- docling/models/layout_model.py +158 -116
- docling/models/page_assemble_model.py +127 -101
- docling/models/page_preprocessing_model.py +79 -0
- docling/models/table_structure_model.py +162 -116
- docling/models/tesseract_ocr_cli_model.py +76 -59
- docling/models/tesseract_ocr_model.py +90 -58
- docling/pipeline/base_pipeline.py +189 -0
- docling/pipeline/simple_pipeline.py +56 -0
- docling/pipeline/standard_pdf_pipeline.py +201 -0
- docling/utils/export.py +4 -3
- docling/utils/layout_utils.py +17 -11
- docling/utils/profiling.py +62 -0
- docling-2.4.1.dist-info/METADATA +154 -0
- docling-2.4.1.dist-info/RECORD +45 -0
- docling/pipeline/base_model_pipeline.py +0 -18
- docling/pipeline/standard_model_pipeline.py +0 -66
- docling-1.19.1.dist-info/METADATA +0 -380
- docling-1.19.1.dist-info/RECORD +0 -34
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
docling/datamodel/document.py
CHANGED
@@ -1,87 +1,113 @@
|
|
1
1
|
import logging
|
2
|
+
import re
|
3
|
+
from enum import Enum
|
2
4
|
from io import BytesIO
|
3
5
|
from pathlib import Path, PurePath
|
4
|
-
from typing import
|
5
|
-
|
6
|
-
|
7
|
-
from docling_core.types import
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
6
|
+
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
|
7
|
+
|
8
|
+
import filetype
|
9
|
+
from docling_core.types.doc import (
|
10
|
+
DocItem,
|
11
|
+
DocItemLabel,
|
12
|
+
DoclingDocument,
|
13
|
+
PictureItem,
|
14
|
+
SectionHeaderItem,
|
15
|
+
TableItem,
|
16
|
+
TextItem,
|
17
|
+
)
|
18
|
+
from docling_core.types.doc.document import ListItem
|
19
|
+
from docling_core.types.legacy_doc.base import (
|
20
|
+
BaseText,
|
21
|
+
Figure,
|
22
|
+
GlmTableCell,
|
23
|
+
PageDimensions,
|
24
|
+
PageReference,
|
25
|
+
Prov,
|
26
|
+
Ref,
|
27
|
+
)
|
28
|
+
from docling_core.types.legacy_doc.base import Table as DsSchemaTable
|
29
|
+
from docling_core.types.legacy_doc.base import TableCell
|
30
|
+
from docling_core.types.legacy_doc.document import (
|
31
|
+
CCSDocumentDescription as DsDocumentDescription,
|
32
|
+
)
|
33
|
+
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
34
|
+
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
35
|
+
from docling_core.utils.file import resolve_file_source
|
15
36
|
from pydantic import BaseModel
|
16
37
|
from typing_extensions import deprecated
|
17
38
|
|
18
|
-
from docling.backend.abstract_backend import
|
19
|
-
|
39
|
+
from docling.backend.abstract_backend import (
|
40
|
+
AbstractDocumentBackend,
|
41
|
+
PaginatedDocumentBackend,
|
42
|
+
)
|
20
43
|
from docling.datamodel.base_models import (
|
21
44
|
AssembledUnit,
|
22
45
|
ConversionStatus,
|
23
46
|
DocumentStream,
|
24
47
|
ErrorItem,
|
25
|
-
|
48
|
+
FormatToExtensions,
|
49
|
+
FormatToMimeType,
|
50
|
+
InputFormat,
|
51
|
+
MimeTypeToFormat,
|
26
52
|
Page,
|
27
|
-
PageElement,
|
28
|
-
TableElement,
|
29
|
-
TextElement,
|
30
53
|
)
|
31
54
|
from docling.datamodel.settings import DocumentLimits
|
32
|
-
from docling.utils.
|
55
|
+
from docling.utils.profiling import ProfilingItem
|
56
|
+
from docling.utils.utils import create_file_hash, create_hash
|
57
|
+
|
58
|
+
if TYPE_CHECKING:
|
59
|
+
from docling.document_converter import FormatOption
|
33
60
|
|
34
61
|
_log = logging.getLogger(__name__)
|
35
62
|
|
36
63
|
layout_label_to_ds_type = {
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
64
|
+
DocItemLabel.TITLE: "title",
|
65
|
+
DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
|
66
|
+
DocItemLabel.SECTION_HEADER: "subtitle-level-1",
|
67
|
+
DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
|
68
|
+
DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
|
69
|
+
DocItemLabel.CAPTION: "caption",
|
70
|
+
DocItemLabel.PAGE_HEADER: "page-header",
|
71
|
+
DocItemLabel.PAGE_FOOTER: "page-footer",
|
72
|
+
DocItemLabel.FOOTNOTE: "footnote",
|
73
|
+
DocItemLabel.TABLE: "table",
|
74
|
+
DocItemLabel.FORMULA: "equation",
|
75
|
+
DocItemLabel.LIST_ITEM: "paragraph",
|
76
|
+
DocItemLabel.CODE: "paragraph",
|
77
|
+
DocItemLabel.PICTURE: "figure",
|
78
|
+
DocItemLabel.TEXT: "paragraph",
|
79
|
+
DocItemLabel.PARAGRAPH: "paragraph",
|
52
80
|
}
|
53
81
|
|
54
|
-
|
55
|
-
_name="",
|
56
|
-
description=DsDocumentDescription(logs=[]),
|
57
|
-
file_info=DsFileInfoObject(
|
58
|
-
filename="",
|
59
|
-
document_hash="",
|
60
|
-
),
|
61
|
-
)
|
82
|
+
_EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
|
62
83
|
|
63
84
|
|
64
85
|
class InputDocument(BaseModel):
|
65
|
-
file: PurePath
|
66
|
-
document_hash:
|
67
|
-
valid: bool =
|
86
|
+
file: PurePath
|
87
|
+
document_hash: str # = None
|
88
|
+
valid: bool = True
|
68
89
|
limits: DocumentLimits = DocumentLimits()
|
90
|
+
format: InputFormat # = None
|
69
91
|
|
70
92
|
filesize: Optional[int] = None
|
71
|
-
page_count:
|
93
|
+
page_count: int = 0
|
72
94
|
|
73
|
-
_backend:
|
95
|
+
_backend: AbstractDocumentBackend # Internal PDF backend used
|
74
96
|
|
75
97
|
def __init__(
|
76
98
|
self,
|
77
99
|
path_or_stream: Union[BytesIO, Path],
|
100
|
+
format: InputFormat,
|
101
|
+
backend: Type[AbstractDocumentBackend],
|
78
102
|
filename: Optional[str] = None,
|
79
103
|
limits: Optional[DocumentLimits] = None,
|
80
|
-
pdf_backend=DoclingParseDocumentBackend,
|
81
104
|
):
|
82
|
-
super().__init__(
|
105
|
+
super().__init__(
|
106
|
+
file="", document_hash="", format=InputFormat.PDF
|
107
|
+
) # initialize with dummy values
|
83
108
|
|
84
109
|
self.limits = limits or DocumentLimits()
|
110
|
+
self.format = format
|
85
111
|
|
86
112
|
try:
|
87
113
|
if isinstance(path_or_stream, Path):
|
@@ -91,11 +117,12 @@ class InputDocument(BaseModel):
|
|
91
117
|
self.valid = False
|
92
118
|
else:
|
93
119
|
self.document_hash = create_file_hash(path_or_stream)
|
94
|
-
self.
|
95
|
-
path_or_stream=path_or_stream, document_hash=self.document_hash
|
96
|
-
)
|
120
|
+
self._init_doc(backend, path_or_stream)
|
97
121
|
|
98
122
|
elif isinstance(path_or_stream, BytesIO):
|
123
|
+
assert (
|
124
|
+
filename is not None
|
125
|
+
), "Can't construct InputDocument from stream without providing filename arg."
|
99
126
|
self.file = PurePath(filename)
|
100
127
|
self.filesize = path_or_stream.getbuffer().nbytes
|
101
128
|
|
@@ -103,31 +130,57 @@ class InputDocument(BaseModel):
|
|
103
130
|
self.valid = False
|
104
131
|
else:
|
105
132
|
self.document_hash = create_file_hash(path_or_stream)
|
106
|
-
self.
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
self.page_count = self._backend.page_count()
|
133
|
+
self._init_doc(backend, path_or_stream)
|
134
|
+
else:
|
135
|
+
raise RuntimeError(
|
136
|
+
f"Unexpected type path_or_stream: {type(path_or_stream)}"
|
137
|
+
)
|
112
138
|
|
113
|
-
|
114
|
-
|
139
|
+
# For paginated backends, check if the maximum page count is exceeded.
|
140
|
+
if self.valid and self._backend.is_valid():
|
141
|
+
if self._backend.supports_pagination() and isinstance(
|
142
|
+
self._backend, PaginatedDocumentBackend
|
143
|
+
):
|
144
|
+
self.page_count = self._backend.page_count()
|
145
|
+
if not self.page_count <= self.limits.max_num_pages:
|
146
|
+
self.valid = False
|
115
147
|
|
116
148
|
except (FileNotFoundError, OSError) as e:
|
149
|
+
self.valid = False
|
117
150
|
_log.exception(
|
118
151
|
f"File {self.file.name} not found or cannot be opened.", exc_info=e
|
119
152
|
)
|
120
153
|
# raise
|
121
154
|
except RuntimeError as e:
|
155
|
+
self.valid = False
|
122
156
|
_log.exception(
|
123
157
|
f"An unexpected error occurred while opening the document {self.file.name}",
|
124
158
|
exc_info=e,
|
125
159
|
)
|
126
160
|
# raise
|
127
161
|
|
162
|
+
def _init_doc(
|
163
|
+
self,
|
164
|
+
backend: Type[AbstractDocumentBackend],
|
165
|
+
path_or_stream: Union[BytesIO, Path],
|
166
|
+
) -> None:
|
167
|
+
if backend is None:
|
168
|
+
raise RuntimeError(
|
169
|
+
f"No backend configuration provided for file {self.file.name} with format {self.format}. "
|
170
|
+
f"Please check your format configuration on DocumentConverter."
|
171
|
+
)
|
172
|
+
|
173
|
+
self._backend = backend(self, path_or_stream=path_or_stream)
|
174
|
+
if not self._backend.is_valid():
|
175
|
+
self.valid = False
|
128
176
|
|
129
|
-
|
130
|
-
class
|
177
|
+
|
178
|
+
class DocumentFormat(str, Enum):
|
179
|
+
V2 = "v2"
|
180
|
+
V1 = "v1"
|
181
|
+
|
182
|
+
|
183
|
+
class ConversionResult(BaseModel):
|
131
184
|
input: InputDocument
|
132
185
|
|
133
186
|
status: ConversionStatus = ConversionStatus.PENDING # failure, success
|
@@ -135,16 +188,44 @@ class ConvertedDocument(BaseModel):
|
|
135
188
|
|
136
189
|
pages: List[Page] = []
|
137
190
|
assembled: AssembledUnit = AssembledUnit()
|
191
|
+
timings: Dict[str, ProfilingItem] = {}
|
192
|
+
|
193
|
+
document: DoclingDocument = _EMPTY_DOCLING_DOC
|
194
|
+
|
195
|
+
@property
|
196
|
+
@deprecated("Use document instead.")
|
197
|
+
def legacy_document(self):
|
198
|
+
reverse_label_mapping = {
|
199
|
+
DocItemLabel.CAPTION.value: "Caption",
|
200
|
+
DocItemLabel.FOOTNOTE.value: "Footnote",
|
201
|
+
DocItemLabel.FORMULA.value: "Formula",
|
202
|
+
DocItemLabel.LIST_ITEM.value: "List-item",
|
203
|
+
DocItemLabel.PAGE_FOOTER.value: "Page-footer",
|
204
|
+
DocItemLabel.PAGE_HEADER.value: "Page-header",
|
205
|
+
DocItemLabel.PICTURE.value: "Picture", # low threshold adjust to capture chemical structures for examples.
|
206
|
+
DocItemLabel.SECTION_HEADER.value: "Section-header",
|
207
|
+
DocItemLabel.TABLE.value: "Table",
|
208
|
+
DocItemLabel.TEXT.value: "Text",
|
209
|
+
DocItemLabel.TITLE.value: "Title",
|
210
|
+
DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
|
211
|
+
DocItemLabel.CODE.value: "Code",
|
212
|
+
DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
|
213
|
+
DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
|
214
|
+
DocItemLabel.FORM.value: "Form",
|
215
|
+
DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
|
216
|
+
DocItemLabel.PARAGRAPH.value: "paragraph",
|
217
|
+
}
|
138
218
|
|
139
|
-
output: DsDocument = _EMPTY_DOC
|
140
|
-
|
141
|
-
def _to_ds_document(self) -> DsDocument:
|
142
219
|
title = ""
|
143
220
|
desc = DsDocumentDescription(logs=[])
|
144
221
|
|
145
222
|
page_hashes = [
|
146
|
-
PageReference(
|
147
|
-
|
223
|
+
PageReference(
|
224
|
+
hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
|
225
|
+
page=p.page_no,
|
226
|
+
model="default",
|
227
|
+
)
|
228
|
+
for p in self.document.pages.values()
|
148
229
|
]
|
149
230
|
|
150
231
|
file_info = DsFileInfoObject(
|
@@ -157,145 +238,199 @@ class ConvertedDocument(BaseModel):
|
|
157
238
|
main_text = []
|
158
239
|
tables = []
|
159
240
|
figures = []
|
241
|
+
equations = []
|
242
|
+
footnotes = []
|
243
|
+
page_headers = []
|
244
|
+
page_footers = []
|
245
|
+
|
246
|
+
embedded_captions = set()
|
247
|
+
for ix, (item, level) in enumerate(
|
248
|
+
self.document.iterate_items(self.document.body)
|
249
|
+
):
|
250
|
+
|
251
|
+
if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
|
252
|
+
caption = item.caption_text(self.document)
|
253
|
+
if caption:
|
254
|
+
embedded_captions.add(caption)
|
255
|
+
|
256
|
+
for item, level in self.document.iterate_items():
|
257
|
+
if isinstance(item, DocItem):
|
258
|
+
item_type = item.label
|
259
|
+
|
260
|
+
if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
|
261
|
+
|
262
|
+
if isinstance(item, ListItem) and item.marker:
|
263
|
+
text = f"{item.marker} {item.text}"
|
264
|
+
else:
|
265
|
+
text = item.text
|
266
|
+
|
267
|
+
# Can be empty.
|
268
|
+
prov = [
|
269
|
+
Prov(
|
270
|
+
bbox=p.bbox.as_tuple(),
|
271
|
+
page=p.page_no,
|
272
|
+
span=[0, len(item.text)],
|
273
|
+
)
|
274
|
+
for p in item.prov
|
275
|
+
]
|
276
|
+
main_text.append(
|
277
|
+
BaseText(
|
278
|
+
text=text,
|
279
|
+
obj_type=layout_label_to_ds_type.get(item.label),
|
280
|
+
name=reverse_label_mapping[item.label],
|
281
|
+
prov=prov,
|
282
|
+
)
|
283
|
+
)
|
160
284
|
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
obj_type=layout_label_to_ds_type.get(element.label),
|
176
|
-
name=element.label,
|
177
|
-
prov=[
|
178
|
-
Prov(
|
179
|
-
bbox=target_bbox,
|
180
|
-
page=element.page_no + 1,
|
181
|
-
span=[0, len(element.text)],
|
182
|
-
)
|
183
|
-
],
|
285
|
+
# skip captions if they are embedded in the actual
|
286
|
+
# floating object
|
287
|
+
if item_type == DocItemLabel.CAPTION and text in embedded_captions:
|
288
|
+
continue
|
289
|
+
|
290
|
+
elif isinstance(item, TableItem) and item.data:
|
291
|
+
index = len(tables)
|
292
|
+
ref_str = f"#/tables/{index}"
|
293
|
+
main_text.append(
|
294
|
+
Ref(
|
295
|
+
name=reverse_label_mapping[item.label],
|
296
|
+
obj_type=layout_label_to_ds_type.get(item.label),
|
297
|
+
ref=ref_str,
|
298
|
+
),
|
184
299
|
)
|
185
|
-
)
|
186
|
-
elif isinstance(element, TableElement):
|
187
|
-
index = len(tables)
|
188
|
-
ref_str = f"#/tables/{index}"
|
189
|
-
main_text.append(
|
190
|
-
Ref(
|
191
|
-
name=element.label,
|
192
|
-
obj_type=layout_label_to_ds_type.get(element.label),
|
193
|
-
ref=ref_str,
|
194
|
-
),
|
195
|
-
)
|
196
300
|
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
301
|
+
# Initialise empty table data grid (only empty cells)
|
302
|
+
table_data = [
|
303
|
+
[
|
304
|
+
TableCell(
|
305
|
+
text="",
|
306
|
+
# bbox=[0,0,0,0],
|
307
|
+
spans=[[i, j]],
|
308
|
+
obj_type="body",
|
309
|
+
)
|
310
|
+
for j in range(item.data.num_cols)
|
311
|
+
]
|
312
|
+
for i in range(item.data.num_rows)
|
207
313
|
]
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
min(cell.start_row_offset_idx, element.num_rows),
|
215
|
-
min(cell.end_row_offset_idx, element.num_rows),
|
216
|
-
):
|
217
|
-
for j in range(
|
218
|
-
min(cell.start_col_offset_idx, element.num_cols),
|
219
|
-
min(cell.end_col_offset_idx, element.num_cols),
|
314
|
+
|
315
|
+
# Overwrite cells in table data for which there is actual cell content.
|
316
|
+
for cell in item.data.table_cells:
|
317
|
+
for i in range(
|
318
|
+
min(cell.start_row_offset_idx, item.data.num_rows),
|
319
|
+
min(cell.end_row_offset_idx, item.data.num_rows),
|
220
320
|
):
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
celltype = "
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
):
|
234
|
-
for
|
321
|
+
for j in range(
|
322
|
+
min(cell.start_col_offset_idx, item.data.num_cols),
|
323
|
+
min(cell.end_col_offset_idx, item.data.num_cols),
|
324
|
+
):
|
325
|
+
celltype = "body"
|
326
|
+
if cell.column_header:
|
327
|
+
celltype = "col_header"
|
328
|
+
elif cell.row_header:
|
329
|
+
celltype = "row_header"
|
330
|
+
elif cell.row_section:
|
331
|
+
celltype = "row_section"
|
332
|
+
|
333
|
+
def make_spans(cell):
|
334
|
+
for rspan in range(
|
335
|
+
min(
|
336
|
+
cell.start_row_offset_idx,
|
337
|
+
item.data.num_rows,
|
338
|
+
),
|
235
339
|
min(
|
236
|
-
cell.
|
340
|
+
cell.end_row_offset_idx, item.data.num_rows
|
237
341
|
),
|
238
|
-
min(cell.end_col_offset_idx, element.num_cols),
|
239
342
|
):
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
343
|
+
for cspan in range(
|
344
|
+
min(
|
345
|
+
cell.start_col_offset_idx,
|
346
|
+
item.data.num_cols,
|
347
|
+
),
|
348
|
+
min(
|
349
|
+
cell.end_col_offset_idx,
|
350
|
+
item.data.num_cols,
|
351
|
+
),
|
352
|
+
):
|
353
|
+
yield [rspan, cspan]
|
354
|
+
|
355
|
+
spans = list(make_spans(cell))
|
356
|
+
table_data[i][j] = GlmTableCell(
|
357
|
+
text=cell.text,
|
358
|
+
bbox=(
|
359
|
+
cell.bbox.as_tuple()
|
360
|
+
if cell.bbox is not None
|
361
|
+
else None
|
362
|
+
), # check if this is bottom-left
|
363
|
+
spans=spans,
|
364
|
+
obj_type=celltype,
|
365
|
+
col=j,
|
366
|
+
row=i,
|
367
|
+
row_header=cell.row_header,
|
368
|
+
row_section=cell.row_section,
|
369
|
+
col_header=cell.column_header,
|
370
|
+
row_span=[
|
371
|
+
cell.start_row_offset_idx,
|
372
|
+
cell.end_row_offset_idx,
|
373
|
+
],
|
374
|
+
col_span=[
|
375
|
+
cell.start_col_offset_idx,
|
376
|
+
cell.end_col_offset_idx,
|
377
|
+
],
|
378
|
+
)
|
379
|
+
|
380
|
+
# Compute the caption
|
381
|
+
caption = item.caption_text(self.document)
|
382
|
+
|
383
|
+
tables.append(
|
384
|
+
DsSchemaTable(
|
385
|
+
text=caption,
|
386
|
+
num_cols=item.data.num_cols,
|
387
|
+
num_rows=item.data.num_rows,
|
388
|
+
obj_type=layout_label_to_ds_type.get(item.label),
|
389
|
+
data=table_data,
|
390
|
+
prov=[
|
391
|
+
Prov(
|
392
|
+
bbox=p.bbox.as_tuple(),
|
393
|
+
page=p.page_no,
|
394
|
+
span=[0, 0],
|
395
|
+
)
|
396
|
+
for p in item.prov
|
397
|
+
],
|
398
|
+
)
|
399
|
+
)
|
255
400
|
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
page=element.page_no + 1,
|
266
|
-
span=[0, 0],
|
267
|
-
)
|
268
|
-
],
|
401
|
+
elif isinstance(item, PictureItem):
|
402
|
+
index = len(figures)
|
403
|
+
ref_str = f"#/figures/{index}"
|
404
|
+
main_text.append(
|
405
|
+
Ref(
|
406
|
+
name=reverse_label_mapping[item.label],
|
407
|
+
obj_type=layout_label_to_ds_type.get(item.label),
|
408
|
+
ref=ref_str,
|
409
|
+
),
|
269
410
|
)
|
270
|
-
)
|
271
411
|
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
)
|
290
|
-
],
|
291
|
-
obj_type=layout_label_to_ds_type.get(element.label),
|
292
|
-
# data=[[]],
|
412
|
+
# Compute the caption
|
413
|
+
caption = item.caption_text(self.document)
|
414
|
+
|
415
|
+
figures.append(
|
416
|
+
Figure(
|
417
|
+
prov=[
|
418
|
+
Prov(
|
419
|
+
bbox=p.bbox.as_tuple(),
|
420
|
+
page=p.page_no,
|
421
|
+
span=[0, len(caption)],
|
422
|
+
)
|
423
|
+
for p in item.prov
|
424
|
+
],
|
425
|
+
obj_type=layout_label_to_ds_type.get(item.label),
|
426
|
+
text=caption,
|
427
|
+
# data=[[]],
|
428
|
+
)
|
293
429
|
)
|
294
|
-
)
|
295
430
|
|
296
431
|
page_dimensions = [
|
297
|
-
PageDimensions(page=p.page_no
|
298
|
-
for p in self.pages
|
432
|
+
PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
|
433
|
+
for p in self.document.pages.values()
|
299
434
|
]
|
300
435
|
|
301
436
|
ds_doc = DsDocument(
|
@@ -303,6 +438,10 @@ class ConvertedDocument(BaseModel):
|
|
303
438
|
description=desc,
|
304
439
|
file_info=file_info,
|
305
440
|
main_text=main_text,
|
441
|
+
equations=equations,
|
442
|
+
footnotes=footnotes,
|
443
|
+
page_headers=page_headers,
|
444
|
+
page_footers=page_footers,
|
306
445
|
tables=tables,
|
307
446
|
figures=figures,
|
308
447
|
page_dimensions=page_dimensions,
|
@@ -310,152 +449,98 @@ class ConvertedDocument(BaseModel):
|
|
310
449
|
|
311
450
|
return ds_doc
|
312
451
|
|
313
|
-
def render_as_dict(self):
|
314
|
-
return self.output.model_dump(by_alias=True, exclude_none=True)
|
315
|
-
|
316
|
-
def render_as_markdown(
|
317
|
-
self,
|
318
|
-
delim: str = "\n\n",
|
319
|
-
main_text_start: int = 0,
|
320
|
-
main_text_stop: Optional[int] = None,
|
321
|
-
main_text_labels: list[str] = [
|
322
|
-
"title",
|
323
|
-
"subtitle-level-1",
|
324
|
-
"paragraph",
|
325
|
-
"caption",
|
326
|
-
"table",
|
327
|
-
"figure",
|
328
|
-
],
|
329
|
-
strict_text: bool = False,
|
330
|
-
image_placeholder: str = "<!-- image -->",
|
331
|
-
):
|
332
|
-
return self.output.export_to_markdown(
|
333
|
-
delim=delim,
|
334
|
-
main_text_start=main_text_start,
|
335
|
-
main_text_stop=main_text_stop,
|
336
|
-
main_text_labels=main_text_labels,
|
337
|
-
strict_text=strict_text,
|
338
|
-
image_placeholder=image_placeholder,
|
339
|
-
)
|
340
452
|
|
341
|
-
|
342
|
-
self,
|
343
|
-
delim: str = "\n\n",
|
344
|
-
main_text_start: int = 0,
|
345
|
-
main_text_stop: Optional[int] = None,
|
346
|
-
main_text_labels: list[str] = [
|
347
|
-
"title",
|
348
|
-
"subtitle-level-1",
|
349
|
-
"paragraph",
|
350
|
-
"caption",
|
351
|
-
],
|
352
|
-
):
|
353
|
-
return self.output.export_to_markdown(
|
354
|
-
delim=delim,
|
355
|
-
main_text_start=main_text_start,
|
356
|
-
main_text_stop=main_text_stop,
|
357
|
-
main_text_labels=main_text_labels,
|
358
|
-
strict_text=True,
|
359
|
-
)
|
360
|
-
|
361
|
-
def render_as_doctags(
|
362
|
-
self,
|
363
|
-
delim: str = "\n\n",
|
364
|
-
main_text_start: int = 0,
|
365
|
-
main_text_stop: Optional[int] = None,
|
366
|
-
main_text_labels: list[str] = [
|
367
|
-
"title",
|
368
|
-
"subtitle-level-1",
|
369
|
-
"paragraph",
|
370
|
-
"caption",
|
371
|
-
"table",
|
372
|
-
"figure",
|
373
|
-
],
|
374
|
-
xsize: int = 100,
|
375
|
-
ysize: int = 100,
|
376
|
-
add_location: bool = True,
|
377
|
-
add_content: bool = True,
|
378
|
-
add_page_index: bool = True,
|
379
|
-
# table specific flags
|
380
|
-
add_table_cell_location: bool = False,
|
381
|
-
add_table_cell_label: bool = True,
|
382
|
-
add_table_cell_text: bool = True,
|
383
|
-
) -> str:
|
384
|
-
return self.output.export_to_document_tokens(
|
385
|
-
delim=delim,
|
386
|
-
main_text_start=main_text_start,
|
387
|
-
main_text_stop=main_text_stop,
|
388
|
-
main_text_labels=main_text_labels,
|
389
|
-
xsize=xsize,
|
390
|
-
ysize=ysize,
|
391
|
-
add_location=add_location,
|
392
|
-
add_content=add_content,
|
393
|
-
add_page_index=add_page_index,
|
394
|
-
# table specific flags
|
395
|
-
add_table_cell_location=add_table_cell_location,
|
396
|
-
add_table_cell_label=add_table_cell_label,
|
397
|
-
add_table_cell_text=add_table_cell_text,
|
398
|
-
)
|
453
|
+
class _DocumentConversionInput(BaseModel):
|
399
454
|
|
400
|
-
|
401
|
-
self, element_types: Tuple[PageElement] = (FigureElement,)
|
402
|
-
):
|
403
|
-
for element in self.assembled.elements:
|
404
|
-
if isinstance(element, element_types):
|
405
|
-
page_ix = element.page_no
|
406
|
-
scale = self.pages[page_ix]._default_image_scale
|
407
|
-
crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
|
408
|
-
page_height=self.pages[page_ix].size.height * scale
|
409
|
-
)
|
410
|
-
|
411
|
-
cropped_im = self.pages[page_ix].image.crop(crop_bbox.as_tuple())
|
412
|
-
yield element, cropped_im
|
413
|
-
|
414
|
-
|
415
|
-
class ConversionResult(ConvertedDocument):
|
416
|
-
pass
|
417
|
-
|
418
|
-
|
419
|
-
class DocumentConversionInput(BaseModel):
|
420
|
-
|
421
|
-
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
|
455
|
+
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
422
456
|
limits: Optional[DocumentLimits] = DocumentLimits()
|
423
457
|
|
424
|
-
DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
|
425
|
-
|
426
458
|
def docs(
|
427
|
-
self,
|
459
|
+
self, format_options: Dict[InputFormat, "FormatOption"]
|
428
460
|
) -> Iterable[InputDocument]:
|
461
|
+
for item in self.path_or_stream_iterator:
|
462
|
+
obj = resolve_file_source(item) if isinstance(item, str) else item
|
463
|
+
format = self._guess_format(obj)
|
464
|
+
if format not in format_options.keys():
|
465
|
+
_log.info(
|
466
|
+
f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
|
467
|
+
)
|
468
|
+
continue
|
469
|
+
else:
|
470
|
+
backend = format_options[format].backend
|
429
471
|
|
430
|
-
pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND
|
431
|
-
|
432
|
-
for obj in self._path_or_stream_iterator:
|
433
472
|
if isinstance(obj, Path):
|
434
473
|
yield InputDocument(
|
435
|
-
path_or_stream=obj,
|
474
|
+
path_or_stream=obj,
|
475
|
+
format=format,
|
476
|
+
filename=obj.name,
|
477
|
+
limits=self.limits,
|
478
|
+
backend=backend,
|
436
479
|
)
|
437
480
|
elif isinstance(obj, DocumentStream):
|
438
481
|
yield InputDocument(
|
439
482
|
path_or_stream=obj.stream,
|
440
|
-
|
483
|
+
format=format,
|
484
|
+
filename=obj.name,
|
441
485
|
limits=self.limits,
|
442
|
-
|
486
|
+
backend=backend,
|
487
|
+
)
|
488
|
+
else:
|
489
|
+
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
|
490
|
+
|
491
|
+
def _guess_format(self, obj: Union[Path, DocumentStream]):
|
492
|
+
content = b"" # empty binary blob
|
493
|
+
format = None
|
494
|
+
|
495
|
+
if isinstance(obj, Path):
|
496
|
+
mime = filetype.guess_mime(str(obj))
|
497
|
+
if mime is None:
|
498
|
+
ext = obj.suffix[1:]
|
499
|
+
mime = self._mime_from_extension(ext)
|
500
|
+
if mime is None: # must guess from the file content
|
501
|
+
with obj.open("rb") as f:
|
502
|
+
content = f.read(1024) # Read first 1KB
|
503
|
+
|
504
|
+
elif isinstance(obj, DocumentStream):
|
505
|
+
content = obj.stream.read(8192)
|
506
|
+
obj.stream.seek(0)
|
507
|
+
mime = filetype.guess_mime(content)
|
508
|
+
if mime is None:
|
509
|
+
ext = (
|
510
|
+
obj.name.rsplit(".", 1)[-1]
|
511
|
+
if ("." in obj.name and not obj.name.startswith("."))
|
512
|
+
else ""
|
443
513
|
)
|
514
|
+
mime = self._mime_from_extension(ext)
|
444
515
|
|
445
|
-
|
446
|
-
|
447
|
-
paths = [Path(p) for p in paths]
|
516
|
+
mime = mime or self._detect_html_xhtml(content)
|
517
|
+
mime = mime or "text/plain"
|
448
518
|
|
449
|
-
|
450
|
-
|
519
|
+
format = MimeTypeToFormat.get(mime)
|
520
|
+
return format
|
451
521
|
|
452
|
-
|
522
|
+
def _mime_from_extension(self, ext):
|
523
|
+
mime = None
|
524
|
+
if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
|
525
|
+
mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
|
526
|
+
elif ext in FormatToExtensions[InputFormat.HTML]:
|
527
|
+
mime = FormatToMimeType[InputFormat.HTML][0]
|
528
|
+
elif ext in FormatToExtensions[InputFormat.MD]:
|
529
|
+
mime = FormatToMimeType[InputFormat.MD][0]
|
453
530
|
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
531
|
+
return mime
|
532
|
+
|
533
|
+
def _detect_html_xhtml(self, content):
|
534
|
+
content_str = content.decode("ascii", errors="ignore").lower()
|
535
|
+
# Remove XML comments
|
536
|
+
content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
|
537
|
+
content_str = content_str.lstrip()
|
538
|
+
|
539
|
+
if re.match(r"<\?xml", content_str):
|
540
|
+
if "xhtml" in content_str[:1000]:
|
541
|
+
return "application/xhtml+xml"
|
542
|
+
|
543
|
+
if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
|
544
|
+
return "text/html"
|
460
545
|
|
461
|
-
return
|
546
|
+
return None
|