docling 1.20.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +32 -37
- docling/backend/docling_parse_backend.py +16 -12
- docling/backend/docling_parse_v2_backend.py +15 -11
- docling/backend/html_backend.py +425 -0
- docling/backend/mspowerpoint_backend.py +375 -0
- docling/backend/msword_backend.py +509 -0
- docling/backend/pdf_backend.py +78 -0
- docling/backend/pypdfium2_backend.py +15 -10
- docling/cli/main.py +61 -60
- docling/datamodel/base_models.py +73 -193
- docling/datamodel/document.py +364 -318
- docling/datamodel/pipeline_options.py +13 -0
- docling/datamodel/settings.py +1 -0
- docling/document_converter.py +215 -252
- docling/models/base_model.py +25 -0
- docling/models/base_ocr_model.py +10 -5
- docling/models/ds_glm_model.py +209 -20
- docling/models/easyocr_model.py +4 -1
- docling/models/layout_model.py +73 -61
- docling/models/page_assemble_model.py +21 -5
- docling/models/page_preprocessing_model.py +57 -0
- docling/models/table_structure_model.py +34 -32
- docling/models/tesseract_ocr_cli_model.py +8 -5
- docling/models/tesseract_ocr_model.py +8 -5
- docling/pipeline/base_pipeline.py +190 -0
- docling/pipeline/simple_pipeline.py +59 -0
- docling/pipeline/standard_pdf_pipeline.py +198 -0
- docling/utils/export.py +4 -3
- docling/utils/layout_utils.py +17 -11
- docling-2.0.0.dist-info/METADATA +149 -0
- docling-2.0.0.dist-info/RECORD +42 -0
- docling/pipeline/base_model_pipeline.py +0 -18
- docling/pipeline/standard_model_pipeline.py +0 -66
- docling-1.20.0.dist-info/METADATA +0 -380
- docling-1.20.0.dist-info/RECORD +0 -35
- {docling-1.20.0.dist-info → docling-2.0.0.dist-info}/LICENSE +0 -0
- {docling-1.20.0.dist-info → docling-2.0.0.dist-info}/WHEEL +0 -0
- {docling-1.20.0.dist-info → docling-2.0.0.dist-info}/entry_points.txt +0 -0
docling/datamodel/document.py
CHANGED
@@ -1,87 +1,101 @@
|
|
1
1
|
import logging
|
2
|
+
import re
|
3
|
+
from enum import Enum
|
2
4
|
from io import BytesIO
|
3
5
|
from pathlib import Path, PurePath
|
4
|
-
from typing import
|
6
|
+
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
|
5
7
|
|
6
|
-
|
8
|
+
import filetype
|
9
|
+
from docling_core.types import BaseText
|
7
10
|
from docling_core.types import Document as DsDocument
|
8
11
|
from docling_core.types import DocumentDescription as DsDocumentDescription
|
9
12
|
from docling_core.types import FileInfoObject as DsFileInfoObject
|
10
13
|
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
11
14
|
from docling_core.types import Table as DsSchemaTable
|
12
|
-
from docling_core.types import
|
13
|
-
|
14
|
-
|
15
|
+
from docling_core.types.doc import (
|
16
|
+
DocItem,
|
17
|
+
DocItemLabel,
|
18
|
+
DoclingDocument,
|
19
|
+
PictureItem,
|
20
|
+
SectionHeaderItem,
|
21
|
+
TableItem,
|
22
|
+
TextItem,
|
23
|
+
)
|
24
|
+
from docling_core.types.doc.document import ListItem
|
25
|
+
from docling_core.types.legacy_doc.base import Figure, GlmTableCell, TableCell
|
26
|
+
from docling_core.utils.file import resolve_file_source
|
15
27
|
from pydantic import BaseModel
|
16
28
|
from typing_extensions import deprecated
|
17
29
|
|
18
|
-
from docling.backend.abstract_backend import
|
19
|
-
|
30
|
+
from docling.backend.abstract_backend import (
|
31
|
+
AbstractDocumentBackend,
|
32
|
+
PaginatedDocumentBackend,
|
33
|
+
)
|
20
34
|
from docling.datamodel.base_models import (
|
21
35
|
AssembledUnit,
|
22
36
|
ConversionStatus,
|
23
37
|
DocumentStream,
|
24
38
|
ErrorItem,
|
25
|
-
|
39
|
+
InputFormat,
|
40
|
+
MimeTypeToFormat,
|
26
41
|
Page,
|
27
|
-
PageElement,
|
28
|
-
TableElement,
|
29
|
-
TextElement,
|
30
42
|
)
|
31
43
|
from docling.datamodel.settings import DocumentLimits
|
32
|
-
from docling.utils.utils import create_file_hash
|
44
|
+
from docling.utils.utils import create_file_hash, create_hash
|
45
|
+
|
46
|
+
if TYPE_CHECKING:
|
47
|
+
from docling.document_converter import FormatOption
|
33
48
|
|
34
49
|
_log = logging.getLogger(__name__)
|
35
50
|
|
36
51
|
layout_label_to_ds_type = {
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
+
DocItemLabel.TITLE: "title",
|
53
|
+
DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
|
54
|
+
DocItemLabel.SECTION_HEADER: "subtitle-level-1",
|
55
|
+
DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
|
56
|
+
DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
|
57
|
+
DocItemLabel.CAPTION: "caption",
|
58
|
+
DocItemLabel.PAGE_HEADER: "page-header",
|
59
|
+
DocItemLabel.PAGE_FOOTER: "page-footer",
|
60
|
+
DocItemLabel.FOOTNOTE: "footnote",
|
61
|
+
DocItemLabel.TABLE: "table",
|
62
|
+
DocItemLabel.FORMULA: "equation",
|
63
|
+
DocItemLabel.LIST_ITEM: "paragraph",
|
64
|
+
DocItemLabel.CODE: "paragraph",
|
65
|
+
DocItemLabel.PICTURE: "figure",
|
66
|
+
DocItemLabel.TEXT: "paragraph",
|
67
|
+
DocItemLabel.PARAGRAPH: "paragraph",
|
52
68
|
}
|
53
69
|
|
54
|
-
|
55
|
-
_name="",
|
56
|
-
description=DsDocumentDescription(logs=[]),
|
57
|
-
file_info=DsFileInfoObject(
|
58
|
-
filename="",
|
59
|
-
document_hash="",
|
60
|
-
),
|
61
|
-
)
|
70
|
+
_EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
|
62
71
|
|
63
72
|
|
64
73
|
class InputDocument(BaseModel):
|
65
|
-
file: PurePath
|
66
|
-
document_hash:
|
67
|
-
valid: bool =
|
74
|
+
file: PurePath
|
75
|
+
document_hash: str # = None
|
76
|
+
valid: bool = True
|
68
77
|
limits: DocumentLimits = DocumentLimits()
|
78
|
+
format: InputFormat # = None
|
69
79
|
|
70
80
|
filesize: Optional[int] = None
|
71
|
-
page_count:
|
81
|
+
page_count: int = 0
|
72
82
|
|
73
|
-
_backend:
|
83
|
+
_backend: AbstractDocumentBackend # Internal document backend used
|
74
84
|
|
75
85
|
def __init__(
|
76
86
|
self,
|
77
87
|
path_or_stream: Union[BytesIO, Path],
|
88
|
+
format: InputFormat,
|
89
|
+
backend: Type[AbstractDocumentBackend],
|
78
90
|
filename: Optional[str] = None,
|
79
91
|
limits: Optional[DocumentLimits] = None,
|
80
|
-
pdf_backend=DoclingParseDocumentBackend,
|
81
92
|
):
|
82
|
-
super().__init__(
|
93
|
+
super().__init__(
|
94
|
+
file="", document_hash="", format=InputFormat.PDF
|
95
|
+
) # initialize with dummy values
|
83
96
|
|
84
97
|
self.limits = limits or DocumentLimits()
|
98
|
+
self.format = format
|
85
99
|
|
86
100
|
try:
|
87
101
|
if isinstance(path_or_stream, Path):
|
@@ -91,11 +105,12 @@ class InputDocument(BaseModel):
|
|
91
105
|
self.valid = False
|
92
106
|
else:
|
93
107
|
self.document_hash = create_file_hash(path_or_stream)
|
94
|
-
self.
|
95
|
-
path_or_stream=path_or_stream, document_hash=self.document_hash
|
96
|
-
)
|
108
|
+
self._init_doc(backend, path_or_stream)
|
97
109
|
|
98
110
|
elif isinstance(path_or_stream, BytesIO):
|
111
|
+
assert (
|
112
|
+
filename is not None
|
113
|
+
), "Can't construct InputDocument from stream without providing filename arg."
|
99
114
|
self.file = PurePath(filename)
|
100
115
|
self.filesize = path_or_stream.getbuffer().nbytes
|
101
116
|
|
@@ -103,15 +118,20 @@ class InputDocument(BaseModel):
|
|
103
118
|
self.valid = False
|
104
119
|
else:
|
105
120
|
self.document_hash = create_file_hash(path_or_stream)
|
106
|
-
self.
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
self.page_count = self._backend.page_count()
|
121
|
+
self._init_doc(backend, path_or_stream)
|
122
|
+
else:
|
123
|
+
raise RuntimeError(
|
124
|
+
f"Unexpected type path_or_stream: {type(path_or_stream)}"
|
125
|
+
)
|
112
126
|
|
113
|
-
|
114
|
-
|
127
|
+
# For paginated backends, check if the maximum page count is exceeded.
|
128
|
+
if self.valid and self._backend.is_valid():
|
129
|
+
if self._backend.supports_pagination() and isinstance(
|
130
|
+
self._backend, PaginatedDocumentBackend
|
131
|
+
):
|
132
|
+
self.page_count = self._backend.page_count()
|
133
|
+
if not self.page_count <= self.limits.max_num_pages:
|
134
|
+
self.valid = False
|
115
135
|
|
116
136
|
except (FileNotFoundError, OSError) as e:
|
117
137
|
_log.exception(
|
@@ -125,9 +145,26 @@ class InputDocument(BaseModel):
|
|
125
145
|
)
|
126
146
|
# raise
|
127
147
|
|
148
|
+
def _init_doc(
|
149
|
+
self,
|
150
|
+
backend: Type[AbstractDocumentBackend],
|
151
|
+
path_or_stream: Union[BytesIO, Path],
|
152
|
+
) -> None:
|
153
|
+
if backend is None:
|
154
|
+
raise RuntimeError(
|
155
|
+
f"No backend configuration provided for file {self.file.name} with format {self.format}. "
|
156
|
+
f"Please check your format configuration on DocumentConverter."
|
157
|
+
)
|
158
|
+
|
159
|
+
self._backend = backend(self, path_or_stream=path_or_stream)
|
160
|
+
|
128
161
|
|
129
|
-
|
130
|
-
|
162
|
+
class DocumentFormat(str, Enum):
|
163
|
+
V2 = "v2"
|
164
|
+
V1 = "v1"
|
165
|
+
|
166
|
+
|
167
|
+
class ConversionResult(BaseModel):
|
131
168
|
input: InputDocument
|
132
169
|
|
133
170
|
status: ConversionStatus = ConversionStatus.PENDING # failure, success
|
@@ -136,15 +173,42 @@ class ConvertedDocument(BaseModel):
|
|
136
173
|
pages: List[Page] = []
|
137
174
|
assembled: AssembledUnit = AssembledUnit()
|
138
175
|
|
139
|
-
|
176
|
+
document: DoclingDocument = _EMPTY_DOCLING_DOC
|
177
|
+
|
178
|
+
@property
|
179
|
+
@deprecated("Use document instead.")
|
180
|
+
def legacy_document(self):
|
181
|
+
reverse_label_mapping = {
|
182
|
+
DocItemLabel.CAPTION.value: "Caption",
|
183
|
+
DocItemLabel.FOOTNOTE.value: "Footnote",
|
184
|
+
DocItemLabel.FORMULA.value: "Formula",
|
185
|
+
DocItemLabel.LIST_ITEM.value: "List-item",
|
186
|
+
DocItemLabel.PAGE_FOOTER.value: "Page-footer",
|
187
|
+
DocItemLabel.PAGE_HEADER.value: "Page-header",
|
188
|
+
DocItemLabel.PICTURE.value: "Picture", # low threshold adjust to capture chemical structures for examples.
|
189
|
+
DocItemLabel.SECTION_HEADER.value: "Section-header",
|
190
|
+
DocItemLabel.TABLE.value: "Table",
|
191
|
+
DocItemLabel.TEXT.value: "Text",
|
192
|
+
DocItemLabel.TITLE.value: "Title",
|
193
|
+
DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
|
194
|
+
DocItemLabel.CODE.value: "Code",
|
195
|
+
DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
|
196
|
+
DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
|
197
|
+
DocItemLabel.FORM.value: "Form",
|
198
|
+
DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
|
199
|
+
DocItemLabel.PARAGRAPH.value: "paragraph",
|
200
|
+
}
|
140
201
|
|
141
|
-
def _to_ds_document(self) -> DsDocument:
|
142
202
|
title = ""
|
143
203
|
desc = DsDocumentDescription(logs=[])
|
144
204
|
|
145
205
|
page_hashes = [
|
146
|
-
PageReference(
|
147
|
-
|
206
|
+
PageReference(
|
207
|
+
hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
|
208
|
+
page=p.page_no,
|
209
|
+
model="default",
|
210
|
+
)
|
211
|
+
for p in self.document.pages.values()
|
148
212
|
]
|
149
213
|
|
150
214
|
file_info = DsFileInfoObject(
|
@@ -157,145 +221,199 @@ class ConvertedDocument(BaseModel):
|
|
157
221
|
main_text = []
|
158
222
|
tables = []
|
159
223
|
figures = []
|
224
|
+
equations = []
|
225
|
+
footnotes = []
|
226
|
+
page_headers = []
|
227
|
+
page_footers = []
|
228
|
+
|
229
|
+
embedded_captions = set()
|
230
|
+
for ix, (item, level) in enumerate(
|
231
|
+
self.document.iterate_items(self.document.body)
|
232
|
+
):
|
233
|
+
|
234
|
+
if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
|
235
|
+
caption = item.caption_text(self.document)
|
236
|
+
if caption:
|
237
|
+
embedded_captions.add(caption)
|
238
|
+
|
239
|
+
for item, level in self.document.iterate_items():
|
240
|
+
if isinstance(item, DocItem):
|
241
|
+
item_type = item.label
|
242
|
+
|
243
|
+
if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
|
244
|
+
|
245
|
+
if isinstance(item, ListItem) and item.marker:
|
246
|
+
text = f"{item.marker} {item.text}"
|
247
|
+
else:
|
248
|
+
text = item.text
|
249
|
+
|
250
|
+
# Can be empty.
|
251
|
+
prov = [
|
252
|
+
Prov(
|
253
|
+
bbox=p.bbox.as_tuple(),
|
254
|
+
page=p.page_no,
|
255
|
+
span=[0, len(item.text)],
|
256
|
+
)
|
257
|
+
for p in item.prov
|
258
|
+
]
|
259
|
+
main_text.append(
|
260
|
+
BaseText(
|
261
|
+
text=text,
|
262
|
+
obj_type=layout_label_to_ds_type.get(item.label),
|
263
|
+
name=reverse_label_mapping[item.label],
|
264
|
+
prov=prov,
|
265
|
+
)
|
266
|
+
)
|
160
267
|
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
obj_type=layout_label_to_ds_type.get(element.label),
|
176
|
-
name=element.label,
|
177
|
-
prov=[
|
178
|
-
Prov(
|
179
|
-
bbox=target_bbox,
|
180
|
-
page=element.page_no + 1,
|
181
|
-
span=[0, len(element.text)],
|
182
|
-
)
|
183
|
-
],
|
268
|
+
# skip captions if they are embedded in the actual
|
269
|
+
# floating object
|
270
|
+
if item_type == DocItemLabel.CAPTION and text in embedded_captions:
|
271
|
+
continue
|
272
|
+
|
273
|
+
elif isinstance(item, TableItem) and item.data:
|
274
|
+
index = len(tables)
|
275
|
+
ref_str = f"#/tables/{index}"
|
276
|
+
main_text.append(
|
277
|
+
Ref(
|
278
|
+
name=reverse_label_mapping[item.label],
|
279
|
+
obj_type=layout_label_to_ds_type.get(item.label),
|
280
|
+
ref=ref_str,
|
281
|
+
),
|
184
282
|
)
|
185
|
-
)
|
186
|
-
elif isinstance(element, TableElement):
|
187
|
-
index = len(tables)
|
188
|
-
ref_str = f"#/tables/{index}"
|
189
|
-
main_text.append(
|
190
|
-
Ref(
|
191
|
-
name=element.label,
|
192
|
-
obj_type=layout_label_to_ds_type.get(element.label),
|
193
|
-
ref=ref_str,
|
194
|
-
),
|
195
|
-
)
|
196
283
|
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
284
|
+
# Initialise empty table data grid (only empty cells)
|
285
|
+
table_data = [
|
286
|
+
[
|
287
|
+
TableCell(
|
288
|
+
text="",
|
289
|
+
# bbox=[0,0,0,0],
|
290
|
+
spans=[[i, j]],
|
291
|
+
obj_type="body",
|
292
|
+
)
|
293
|
+
for j in range(item.data.num_cols)
|
294
|
+
]
|
295
|
+
for i in range(item.data.num_rows)
|
207
296
|
]
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
min(cell.start_row_offset_idx, element.num_rows),
|
215
|
-
min(cell.end_row_offset_idx, element.num_rows),
|
216
|
-
):
|
217
|
-
for j in range(
|
218
|
-
min(cell.start_col_offset_idx, element.num_cols),
|
219
|
-
min(cell.end_col_offset_idx, element.num_cols),
|
297
|
+
|
298
|
+
# Overwrite cells in table data for which there is actual cell content.
|
299
|
+
for cell in item.data.table_cells:
|
300
|
+
for i in range(
|
301
|
+
min(cell.start_row_offset_idx, item.data.num_rows),
|
302
|
+
min(cell.end_row_offset_idx, item.data.num_rows),
|
220
303
|
):
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
celltype = "
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
):
|
234
|
-
for
|
304
|
+
for j in range(
|
305
|
+
min(cell.start_col_offset_idx, item.data.num_cols),
|
306
|
+
min(cell.end_col_offset_idx, item.data.num_cols),
|
307
|
+
):
|
308
|
+
celltype = "body"
|
309
|
+
if cell.column_header:
|
310
|
+
celltype = "col_header"
|
311
|
+
elif cell.row_header:
|
312
|
+
celltype = "row_header"
|
313
|
+
elif cell.row_section:
|
314
|
+
celltype = "row_section"
|
315
|
+
|
316
|
+
def make_spans(cell):
|
317
|
+
for rspan in range(
|
318
|
+
min(
|
319
|
+
cell.start_row_offset_idx,
|
320
|
+
item.data.num_rows,
|
321
|
+
),
|
235
322
|
min(
|
236
|
-
cell.
|
323
|
+
cell.end_row_offset_idx, item.data.num_rows
|
237
324
|
),
|
238
|
-
min(cell.end_col_offset_idx, element.num_cols),
|
239
325
|
):
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
326
|
+
for cspan in range(
|
327
|
+
min(
|
328
|
+
cell.start_col_offset_idx,
|
329
|
+
item.data.num_cols,
|
330
|
+
),
|
331
|
+
min(
|
332
|
+
cell.end_col_offset_idx,
|
333
|
+
item.data.num_cols,
|
334
|
+
),
|
335
|
+
):
|
336
|
+
yield [rspan, cspan]
|
337
|
+
|
338
|
+
spans = list(make_spans(cell))
|
339
|
+
table_data[i][j] = GlmTableCell(
|
340
|
+
text=cell.text,
|
341
|
+
bbox=(
|
342
|
+
cell.bbox.as_tuple()
|
343
|
+
if cell.bbox is not None
|
344
|
+
else None
|
345
|
+
), # check if this is bottom-left
|
346
|
+
spans=spans,
|
347
|
+
obj_type=celltype,
|
348
|
+
col=j,
|
349
|
+
row=i,
|
350
|
+
row_header=cell.row_header,
|
351
|
+
row_section=cell.row_section,
|
352
|
+
col_header=cell.column_header,
|
353
|
+
row_span=[
|
354
|
+
cell.start_row_offset_idx,
|
355
|
+
cell.end_row_offset_idx,
|
356
|
+
],
|
357
|
+
col_span=[
|
358
|
+
cell.start_col_offset_idx,
|
359
|
+
cell.end_col_offset_idx,
|
360
|
+
],
|
361
|
+
)
|
362
|
+
|
363
|
+
# Compute the caption
|
364
|
+
caption = item.caption_text(self.document)
|
365
|
+
|
366
|
+
tables.append(
|
367
|
+
DsSchemaTable(
|
368
|
+
text=caption,
|
369
|
+
num_cols=item.data.num_cols,
|
370
|
+
num_rows=item.data.num_rows,
|
371
|
+
obj_type=layout_label_to_ds_type.get(item.label),
|
372
|
+
data=table_data,
|
373
|
+
prov=[
|
374
|
+
Prov(
|
375
|
+
bbox=p.bbox.as_tuple(),
|
376
|
+
page=p.page_no,
|
377
|
+
span=[0, 0],
|
378
|
+
)
|
379
|
+
for p in item.prov
|
380
|
+
],
|
381
|
+
)
|
382
|
+
)
|
255
383
|
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
page=element.page_no + 1,
|
266
|
-
span=[0, 0],
|
267
|
-
)
|
268
|
-
],
|
384
|
+
elif isinstance(item, PictureItem):
|
385
|
+
index = len(figures)
|
386
|
+
ref_str = f"#/figures/{index}"
|
387
|
+
main_text.append(
|
388
|
+
Ref(
|
389
|
+
name=reverse_label_mapping[item.label],
|
390
|
+
obj_type=layout_label_to_ds_type.get(item.label),
|
391
|
+
ref=ref_str,
|
392
|
+
),
|
269
393
|
)
|
270
|
-
)
|
271
394
|
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
)
|
290
|
-
],
|
291
|
-
obj_type=layout_label_to_ds_type.get(element.label),
|
292
|
-
# data=[[]],
|
395
|
+
# Compute the caption
|
396
|
+
caption = item.caption_text(self.document)
|
397
|
+
|
398
|
+
figures.append(
|
399
|
+
Figure(
|
400
|
+
prov=[
|
401
|
+
Prov(
|
402
|
+
bbox=p.bbox.as_tuple(),
|
403
|
+
page=p.page_no,
|
404
|
+
span=[0, len(caption)],
|
405
|
+
)
|
406
|
+
for p in item.prov
|
407
|
+
],
|
408
|
+
obj_type=layout_label_to_ds_type.get(item.label),
|
409
|
+
text=caption,
|
410
|
+
# data=[[]],
|
411
|
+
)
|
293
412
|
)
|
294
|
-
)
|
295
413
|
|
296
414
|
page_dimensions = [
|
297
|
-
PageDimensions(page=p.page_no
|
298
|
-
for p in self.pages
|
415
|
+
PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
|
416
|
+
for p in self.document.pages.values()
|
299
417
|
]
|
300
418
|
|
301
419
|
ds_doc = DsDocument(
|
@@ -303,6 +421,10 @@ class ConvertedDocument(BaseModel):
|
|
303
421
|
description=desc,
|
304
422
|
file_info=file_info,
|
305
423
|
main_text=main_text,
|
424
|
+
equations=equations,
|
425
|
+
footnotes=footnotes,
|
426
|
+
page_headers=page_headers,
|
427
|
+
page_footers=page_footers,
|
306
428
|
tables=tables,
|
307
429
|
figures=figures,
|
308
430
|
page_dimensions=page_dimensions,
|
@@ -310,152 +432,76 @@ class ConvertedDocument(BaseModel):
|
|
310
432
|
|
311
433
|
return ds_doc
|
312
434
|
|
313
|
-
def render_as_dict(self):
|
314
|
-
return self.output.model_dump(by_alias=True, exclude_none=True)
|
315
|
-
|
316
|
-
def render_as_markdown(
|
317
|
-
self,
|
318
|
-
delim: str = "\n\n",
|
319
|
-
main_text_start: int = 0,
|
320
|
-
main_text_stop: Optional[int] = None,
|
321
|
-
main_text_labels: list[str] = [
|
322
|
-
"title",
|
323
|
-
"subtitle-level-1",
|
324
|
-
"paragraph",
|
325
|
-
"caption",
|
326
|
-
"table",
|
327
|
-
"figure",
|
328
|
-
],
|
329
|
-
strict_text: bool = False,
|
330
|
-
image_placeholder: str = "<!-- image -->",
|
331
|
-
):
|
332
|
-
return self.output.export_to_markdown(
|
333
|
-
delim=delim,
|
334
|
-
main_text_start=main_text_start,
|
335
|
-
main_text_stop=main_text_stop,
|
336
|
-
main_text_labels=main_text_labels,
|
337
|
-
strict_text=strict_text,
|
338
|
-
image_placeholder=image_placeholder,
|
339
|
-
)
|
340
|
-
|
341
|
-
def render_as_text(
|
342
|
-
self,
|
343
|
-
delim: str = "\n\n",
|
344
|
-
main_text_start: int = 0,
|
345
|
-
main_text_stop: Optional[int] = None,
|
346
|
-
main_text_labels: list[str] = [
|
347
|
-
"title",
|
348
|
-
"subtitle-level-1",
|
349
|
-
"paragraph",
|
350
|
-
"caption",
|
351
|
-
],
|
352
|
-
):
|
353
|
-
return self.output.export_to_markdown(
|
354
|
-
delim=delim,
|
355
|
-
main_text_start=main_text_start,
|
356
|
-
main_text_stop=main_text_stop,
|
357
|
-
main_text_labels=main_text_labels,
|
358
|
-
strict_text=True,
|
359
|
-
)
|
360
|
-
|
361
|
-
def render_as_doctags(
|
362
|
-
self,
|
363
|
-
delim: str = "\n\n",
|
364
|
-
main_text_start: int = 0,
|
365
|
-
main_text_stop: Optional[int] = None,
|
366
|
-
main_text_labels: list[str] = [
|
367
|
-
"title",
|
368
|
-
"subtitle-level-1",
|
369
|
-
"paragraph",
|
370
|
-
"caption",
|
371
|
-
"table",
|
372
|
-
"figure",
|
373
|
-
],
|
374
|
-
xsize: int = 100,
|
375
|
-
ysize: int = 100,
|
376
|
-
add_location: bool = True,
|
377
|
-
add_content: bool = True,
|
378
|
-
add_page_index: bool = True,
|
379
|
-
# table specific flags
|
380
|
-
add_table_cell_location: bool = False,
|
381
|
-
add_table_cell_label: bool = True,
|
382
|
-
add_table_cell_text: bool = True,
|
383
|
-
) -> str:
|
384
|
-
return self.output.export_to_document_tokens(
|
385
|
-
delim=delim,
|
386
|
-
main_text_start=main_text_start,
|
387
|
-
main_text_stop=main_text_stop,
|
388
|
-
main_text_labels=main_text_labels,
|
389
|
-
xsize=xsize,
|
390
|
-
ysize=ysize,
|
391
|
-
add_location=add_location,
|
392
|
-
add_content=add_content,
|
393
|
-
add_page_index=add_page_index,
|
394
|
-
# table specific flags
|
395
|
-
add_table_cell_location=add_table_cell_location,
|
396
|
-
add_table_cell_label=add_table_cell_label,
|
397
|
-
add_table_cell_text=add_table_cell_text,
|
398
|
-
)
|
399
|
-
|
400
|
-
def render_element_images(
|
401
|
-
self, element_types: Tuple[PageElement] = (FigureElement,)
|
402
|
-
):
|
403
|
-
for element in self.assembled.elements:
|
404
|
-
if isinstance(element, element_types):
|
405
|
-
page_ix = element.page_no
|
406
|
-
scale = self.pages[page_ix]._default_image_scale
|
407
|
-
crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
|
408
|
-
page_height=self.pages[page_ix].size.height * scale
|
409
|
-
)
|
410
|
-
|
411
|
-
cropped_im = self.pages[page_ix].image.crop(crop_bbox.as_tuple())
|
412
|
-
yield element, cropped_im
|
413
|
-
|
414
435
|
|
415
|
-
class
|
416
|
-
pass
|
436
|
+
class _DocumentConversionInput(BaseModel):
|
417
437
|
|
418
|
-
|
419
|
-
class DocumentConversionInput(BaseModel):
|
420
|
-
|
421
|
-
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
|
438
|
+
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
422
439
|
limits: Optional[DocumentLimits] = DocumentLimits()
|
423
440
|
|
424
|
-
DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
|
425
|
-
|
426
441
|
def docs(
|
427
|
-
self,
|
442
|
+
self, format_options: Dict[InputFormat, "FormatOption"]
|
428
443
|
) -> Iterable[InputDocument]:
|
444
|
+
for item in self.path_or_stream_iterator:
|
445
|
+
obj = resolve_file_source(item) if isinstance(item, str) else item
|
446
|
+
format = self._guess_format(obj)
|
447
|
+
if format not in format_options.keys():
|
448
|
+
_log.info(
|
449
|
+
f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
|
450
|
+
)
|
451
|
+
continue
|
452
|
+
else:
|
453
|
+
backend = format_options[format].backend
|
429
454
|
|
430
|
-
pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND
|
431
|
-
|
432
|
-
for obj in self._path_or_stream_iterator:
|
433
455
|
if isinstance(obj, Path):
|
434
456
|
yield InputDocument(
|
435
|
-
path_or_stream=obj,
|
457
|
+
path_or_stream=obj,
|
458
|
+
format=format,
|
459
|
+
filename=obj.name,
|
460
|
+
limits=self.limits,
|
461
|
+
backend=backend,
|
436
462
|
)
|
437
463
|
elif isinstance(obj, DocumentStream):
|
438
464
|
yield InputDocument(
|
439
465
|
path_or_stream=obj.stream,
|
440
|
-
|
466
|
+
format=format,
|
467
|
+
filename=obj.name,
|
441
468
|
limits=self.limits,
|
442
|
-
|
469
|
+
backend=backend,
|
443
470
|
)
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
471
|
+
else:
|
472
|
+
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
|
473
|
+
|
474
|
+
def _guess_format(self, obj):
|
475
|
+
content = None
|
476
|
+
if isinstance(obj, Path):
|
477
|
+
mime = filetype.guess_mime(str(obj))
|
478
|
+
if mime is None:
|
479
|
+
with obj.open("rb") as f:
|
480
|
+
content = f.read(1024) # Read first 1KB
|
481
|
+
|
482
|
+
elif isinstance(obj, DocumentStream):
|
483
|
+
obj.stream.seek(0)
|
484
|
+
content = obj.stream.read(8192)
|
485
|
+
obj.stream.seek(0)
|
486
|
+
mime = filetype.guess_mime(content)
|
487
|
+
|
488
|
+
if mime is None:
|
489
|
+
mime = self._detect_html_xhtml(content)
|
490
|
+
|
491
|
+
format = MimeTypeToFormat.get(mime)
|
492
|
+
return format
|
493
|
+
|
494
|
+
def _detect_html_xhtml(self, content):
|
495
|
+
content_str = content.decode("ascii", errors="ignore").lower()
|
496
|
+
# Remove XML comments
|
497
|
+
content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
|
498
|
+
content_str = content_str.lstrip()
|
499
|
+
|
500
|
+
if re.match(r"<\?xml", content_str):
|
501
|
+
if "xhtml" in content_str[:1000]:
|
502
|
+
return "application/xhtml+xml"
|
503
|
+
|
504
|
+
if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
|
505
|
+
return "text/html"
|
506
|
+
|
507
|
+
return None
|