docling 1.19.1__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +32 -37
- docling/backend/docling_parse_backend.py +16 -12
- docling/backend/docling_parse_v2_backend.py +240 -0
- docling/backend/html_backend.py +425 -0
- docling/backend/mspowerpoint_backend.py +375 -0
- docling/backend/msword_backend.py +509 -0
- docling/backend/pdf_backend.py +78 -0
- docling/backend/pypdfium2_backend.py +15 -10
- docling/cli/main.py +61 -60
- docling/datamodel/base_models.py +73 -193
- docling/datamodel/document.py +379 -324
- docling/datamodel/pipeline_options.py +16 -0
- docling/datamodel/settings.py +1 -0
- docling/document_converter.py +215 -252
- docling/models/base_model.py +25 -0
- docling/models/base_ocr_model.py +19 -6
- docling/models/ds_glm_model.py +220 -22
- docling/models/easyocr_model.py +45 -40
- docling/models/layout_model.py +130 -114
- docling/models/page_assemble_model.py +119 -95
- docling/models/page_preprocessing_model.py +61 -0
- docling/models/table_structure_model.py +122 -111
- docling/models/tesseract_ocr_cli_model.py +63 -56
- docling/models/tesseract_ocr_model.py +58 -50
- docling/pipeline/base_pipeline.py +190 -0
- docling/pipeline/simple_pipeline.py +59 -0
- docling/pipeline/standard_pdf_pipeline.py +198 -0
- docling/utils/export.py +4 -3
- docling/utils/layout_utils.py +17 -11
- docling-2.1.0.dist-info/METADATA +149 -0
- docling-2.1.0.dist-info/RECORD +42 -0
- docling/pipeline/base_model_pipeline.py +0 -18
- docling/pipeline/standard_model_pipeline.py +0 -66
- docling-1.19.1.dist-info/METADATA +0 -380
- docling-1.19.1.dist-info/RECORD +0 -34
- {docling-1.19.1.dist-info → docling-2.1.0.dist-info}/LICENSE +0 -0
- {docling-1.19.1.dist-info → docling-2.1.0.dist-info}/WHEEL +0 -0
- {docling-1.19.1.dist-info → docling-2.1.0.dist-info}/entry_points.txt +0 -0
docling/datamodel/document.py
CHANGED
@@ -1,87 +1,110 @@
|
|
1
1
|
import logging
|
2
|
+
import re
|
3
|
+
from enum import Enum
|
2
4
|
from io import BytesIO
|
3
5
|
from pathlib import Path, PurePath
|
4
|
-
from typing import
|
5
|
-
|
6
|
-
|
7
|
-
from docling_core.types import
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
6
|
+
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
|
7
|
+
|
8
|
+
import filetype
|
9
|
+
from docling_core.types.doc import (
|
10
|
+
DocItem,
|
11
|
+
DocItemLabel,
|
12
|
+
DoclingDocument,
|
13
|
+
PictureItem,
|
14
|
+
SectionHeaderItem,
|
15
|
+
TableItem,
|
16
|
+
TextItem,
|
17
|
+
)
|
18
|
+
from docling_core.types.doc.document import ListItem
|
19
|
+
from docling_core.types.legacy_doc.base import (
|
20
|
+
BaseText,
|
21
|
+
Figure,
|
22
|
+
GlmTableCell,
|
23
|
+
PageDimensions,
|
24
|
+
PageReference,
|
25
|
+
Prov,
|
26
|
+
Ref,
|
27
|
+
)
|
28
|
+
from docling_core.types.legacy_doc.base import Table as DsSchemaTable
|
29
|
+
from docling_core.types.legacy_doc.base import TableCell
|
30
|
+
from docling_core.types.legacy_doc.document import (
|
31
|
+
CCSDocumentDescription as DsDocumentDescription,
|
32
|
+
)
|
33
|
+
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
34
|
+
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
35
|
+
from docling_core.utils.file import resolve_file_source
|
15
36
|
from pydantic import BaseModel
|
16
37
|
from typing_extensions import deprecated
|
17
38
|
|
18
|
-
from docling.backend.abstract_backend import
|
19
|
-
|
39
|
+
from docling.backend.abstract_backend import (
|
40
|
+
AbstractDocumentBackend,
|
41
|
+
PaginatedDocumentBackend,
|
42
|
+
)
|
20
43
|
from docling.datamodel.base_models import (
|
21
44
|
AssembledUnit,
|
22
45
|
ConversionStatus,
|
23
46
|
DocumentStream,
|
24
47
|
ErrorItem,
|
25
|
-
|
48
|
+
InputFormat,
|
49
|
+
MimeTypeToFormat,
|
26
50
|
Page,
|
27
|
-
PageElement,
|
28
|
-
TableElement,
|
29
|
-
TextElement,
|
30
51
|
)
|
31
52
|
from docling.datamodel.settings import DocumentLimits
|
32
|
-
from docling.utils.utils import create_file_hash
|
53
|
+
from docling.utils.utils import create_file_hash, create_hash
|
54
|
+
|
55
|
+
if TYPE_CHECKING:
|
56
|
+
from docling.document_converter import FormatOption
|
33
57
|
|
34
58
|
_log = logging.getLogger(__name__)
|
35
59
|
|
36
60
|
layout_label_to_ds_type = {
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
61
|
+
DocItemLabel.TITLE: "title",
|
62
|
+
DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
|
63
|
+
DocItemLabel.SECTION_HEADER: "subtitle-level-1",
|
64
|
+
DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
|
65
|
+
DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
|
66
|
+
DocItemLabel.CAPTION: "caption",
|
67
|
+
DocItemLabel.PAGE_HEADER: "page-header",
|
68
|
+
DocItemLabel.PAGE_FOOTER: "page-footer",
|
69
|
+
DocItemLabel.FOOTNOTE: "footnote",
|
70
|
+
DocItemLabel.TABLE: "table",
|
71
|
+
DocItemLabel.FORMULA: "equation",
|
72
|
+
DocItemLabel.LIST_ITEM: "paragraph",
|
73
|
+
DocItemLabel.CODE: "paragraph",
|
74
|
+
DocItemLabel.PICTURE: "figure",
|
75
|
+
DocItemLabel.TEXT: "paragraph",
|
76
|
+
DocItemLabel.PARAGRAPH: "paragraph",
|
52
77
|
}
|
53
78
|
|
54
|
-
|
55
|
-
_name="",
|
56
|
-
description=DsDocumentDescription(logs=[]),
|
57
|
-
file_info=DsFileInfoObject(
|
58
|
-
filename="",
|
59
|
-
document_hash="",
|
60
|
-
),
|
61
|
-
)
|
79
|
+
_EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
|
62
80
|
|
63
81
|
|
64
82
|
class InputDocument(BaseModel):
|
65
|
-
file: PurePath
|
66
|
-
document_hash:
|
67
|
-
valid: bool =
|
83
|
+
file: PurePath
|
84
|
+
document_hash: str # = None
|
85
|
+
valid: bool = True
|
68
86
|
limits: DocumentLimits = DocumentLimits()
|
87
|
+
format: InputFormat # = None
|
69
88
|
|
70
89
|
filesize: Optional[int] = None
|
71
|
-
page_count:
|
90
|
+
page_count: int = 0
|
72
91
|
|
73
|
-
_backend:
|
92
|
+
_backend: AbstractDocumentBackend # Internal PDF backend used
|
74
93
|
|
75
94
|
def __init__(
|
76
95
|
self,
|
77
96
|
path_or_stream: Union[BytesIO, Path],
|
97
|
+
format: InputFormat,
|
98
|
+
backend: Type[AbstractDocumentBackend],
|
78
99
|
filename: Optional[str] = None,
|
79
100
|
limits: Optional[DocumentLimits] = None,
|
80
|
-
pdf_backend=DoclingParseDocumentBackend,
|
81
101
|
):
|
82
|
-
super().__init__(
|
102
|
+
super().__init__(
|
103
|
+
file="", document_hash="", format=InputFormat.PDF
|
104
|
+
) # initialize with dummy values
|
83
105
|
|
84
106
|
self.limits = limits or DocumentLimits()
|
107
|
+
self.format = format
|
85
108
|
|
86
109
|
try:
|
87
110
|
if isinstance(path_or_stream, Path):
|
@@ -91,11 +114,12 @@ class InputDocument(BaseModel):
|
|
91
114
|
self.valid = False
|
92
115
|
else:
|
93
116
|
self.document_hash = create_file_hash(path_or_stream)
|
94
|
-
self.
|
95
|
-
path_or_stream=path_or_stream, document_hash=self.document_hash
|
96
|
-
)
|
117
|
+
self._init_doc(backend, path_or_stream)
|
97
118
|
|
98
119
|
elif isinstance(path_or_stream, BytesIO):
|
120
|
+
assert (
|
121
|
+
filename is not None
|
122
|
+
), "Can't construct InputDocument from stream without providing filename arg."
|
99
123
|
self.file = PurePath(filename)
|
100
124
|
self.filesize = path_or_stream.getbuffer().nbytes
|
101
125
|
|
@@ -103,15 +127,20 @@ class InputDocument(BaseModel):
|
|
103
127
|
self.valid = False
|
104
128
|
else:
|
105
129
|
self.document_hash = create_file_hash(path_or_stream)
|
106
|
-
self.
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
self.page_count = self._backend.page_count()
|
130
|
+
self._init_doc(backend, path_or_stream)
|
131
|
+
else:
|
132
|
+
raise RuntimeError(
|
133
|
+
f"Unexpected type path_or_stream: {type(path_or_stream)}"
|
134
|
+
)
|
112
135
|
|
113
|
-
|
114
|
-
|
136
|
+
# For paginated backends, check if the maximum page count is exceeded.
|
137
|
+
if self.valid and self._backend.is_valid():
|
138
|
+
if self._backend.supports_pagination() and isinstance(
|
139
|
+
self._backend, PaginatedDocumentBackend
|
140
|
+
):
|
141
|
+
self.page_count = self._backend.page_count()
|
142
|
+
if not self.page_count <= self.limits.max_num_pages:
|
143
|
+
self.valid = False
|
115
144
|
|
116
145
|
except (FileNotFoundError, OSError) as e:
|
117
146
|
_log.exception(
|
@@ -125,9 +154,26 @@ class InputDocument(BaseModel):
|
|
125
154
|
)
|
126
155
|
# raise
|
127
156
|
|
157
|
+
def _init_doc(
|
158
|
+
self,
|
159
|
+
backend: Type[AbstractDocumentBackend],
|
160
|
+
path_or_stream: Union[BytesIO, Path],
|
161
|
+
) -> None:
|
162
|
+
if backend is None:
|
163
|
+
raise RuntimeError(
|
164
|
+
f"No backend configuration provided for file {self.file.name} with format {self.format}. "
|
165
|
+
f"Please check your format configuration on DocumentConverter."
|
166
|
+
)
|
167
|
+
|
168
|
+
self._backend = backend(self, path_or_stream=path_or_stream)
|
128
169
|
|
129
|
-
|
130
|
-
class
|
170
|
+
|
171
|
+
class DocumentFormat(str, Enum):
|
172
|
+
V2 = "v2"
|
173
|
+
V1 = "v1"
|
174
|
+
|
175
|
+
|
176
|
+
class ConversionResult(BaseModel):
|
131
177
|
input: InputDocument
|
132
178
|
|
133
179
|
status: ConversionStatus = ConversionStatus.PENDING # failure, success
|
@@ -136,15 +182,42 @@ class ConvertedDocument(BaseModel):
|
|
136
182
|
pages: List[Page] = []
|
137
183
|
assembled: AssembledUnit = AssembledUnit()
|
138
184
|
|
139
|
-
|
185
|
+
document: DoclingDocument = _EMPTY_DOCLING_DOC
|
186
|
+
|
187
|
+
@property
|
188
|
+
@deprecated("Use document instead.")
|
189
|
+
def legacy_document(self):
|
190
|
+
reverse_label_mapping = {
|
191
|
+
DocItemLabel.CAPTION.value: "Caption",
|
192
|
+
DocItemLabel.FOOTNOTE.value: "Footnote",
|
193
|
+
DocItemLabel.FORMULA.value: "Formula",
|
194
|
+
DocItemLabel.LIST_ITEM.value: "List-item",
|
195
|
+
DocItemLabel.PAGE_FOOTER.value: "Page-footer",
|
196
|
+
DocItemLabel.PAGE_HEADER.value: "Page-header",
|
197
|
+
DocItemLabel.PICTURE.value: "Picture", # low threshold adjust to capture chemical structures for examples.
|
198
|
+
DocItemLabel.SECTION_HEADER.value: "Section-header",
|
199
|
+
DocItemLabel.TABLE.value: "Table",
|
200
|
+
DocItemLabel.TEXT.value: "Text",
|
201
|
+
DocItemLabel.TITLE.value: "Title",
|
202
|
+
DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
|
203
|
+
DocItemLabel.CODE.value: "Code",
|
204
|
+
DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
|
205
|
+
DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
|
206
|
+
DocItemLabel.FORM.value: "Form",
|
207
|
+
DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
|
208
|
+
DocItemLabel.PARAGRAPH.value: "paragraph",
|
209
|
+
}
|
140
210
|
|
141
|
-
def _to_ds_document(self) -> DsDocument:
|
142
211
|
title = ""
|
143
212
|
desc = DsDocumentDescription(logs=[])
|
144
213
|
|
145
214
|
page_hashes = [
|
146
|
-
PageReference(
|
147
|
-
|
215
|
+
PageReference(
|
216
|
+
hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
|
217
|
+
page=p.page_no,
|
218
|
+
model="default",
|
219
|
+
)
|
220
|
+
for p in self.document.pages.values()
|
148
221
|
]
|
149
222
|
|
150
223
|
file_info = DsFileInfoObject(
|
@@ -157,145 +230,199 @@ class ConvertedDocument(BaseModel):
|
|
157
230
|
main_text = []
|
158
231
|
tables = []
|
159
232
|
figures = []
|
233
|
+
equations = []
|
234
|
+
footnotes = []
|
235
|
+
page_headers = []
|
236
|
+
page_footers = []
|
237
|
+
|
238
|
+
embedded_captions = set()
|
239
|
+
for ix, (item, level) in enumerate(
|
240
|
+
self.document.iterate_items(self.document.body)
|
241
|
+
):
|
242
|
+
|
243
|
+
if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
|
244
|
+
caption = item.caption_text(self.document)
|
245
|
+
if caption:
|
246
|
+
embedded_captions.add(caption)
|
247
|
+
|
248
|
+
for item, level in self.document.iterate_items():
|
249
|
+
if isinstance(item, DocItem):
|
250
|
+
item_type = item.label
|
251
|
+
|
252
|
+
if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
|
253
|
+
|
254
|
+
if isinstance(item, ListItem) and item.marker:
|
255
|
+
text = f"{item.marker} {item.text}"
|
256
|
+
else:
|
257
|
+
text = item.text
|
258
|
+
|
259
|
+
# Can be empty.
|
260
|
+
prov = [
|
261
|
+
Prov(
|
262
|
+
bbox=p.bbox.as_tuple(),
|
263
|
+
page=p.page_no,
|
264
|
+
span=[0, len(item.text)],
|
265
|
+
)
|
266
|
+
for p in item.prov
|
267
|
+
]
|
268
|
+
main_text.append(
|
269
|
+
BaseText(
|
270
|
+
text=text,
|
271
|
+
obj_type=layout_label_to_ds_type.get(item.label),
|
272
|
+
name=reverse_label_mapping[item.label],
|
273
|
+
prov=prov,
|
274
|
+
)
|
275
|
+
)
|
160
276
|
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
obj_type=layout_label_to_ds_type.get(element.label),
|
176
|
-
name=element.label,
|
177
|
-
prov=[
|
178
|
-
Prov(
|
179
|
-
bbox=target_bbox,
|
180
|
-
page=element.page_no + 1,
|
181
|
-
span=[0, len(element.text)],
|
182
|
-
)
|
183
|
-
],
|
277
|
+
# skip captions of they are embedded in the actual
|
278
|
+
# floating object
|
279
|
+
if item_type == DocItemLabel.CAPTION and text in embedded_captions:
|
280
|
+
continue
|
281
|
+
|
282
|
+
elif isinstance(item, TableItem) and item.data:
|
283
|
+
index = len(tables)
|
284
|
+
ref_str = f"#/tables/{index}"
|
285
|
+
main_text.append(
|
286
|
+
Ref(
|
287
|
+
name=reverse_label_mapping[item.label],
|
288
|
+
obj_type=layout_label_to_ds_type.get(item.label),
|
289
|
+
ref=ref_str,
|
290
|
+
),
|
184
291
|
)
|
185
|
-
)
|
186
|
-
elif isinstance(element, TableElement):
|
187
|
-
index = len(tables)
|
188
|
-
ref_str = f"#/tables/{index}"
|
189
|
-
main_text.append(
|
190
|
-
Ref(
|
191
|
-
name=element.label,
|
192
|
-
obj_type=layout_label_to_ds_type.get(element.label),
|
193
|
-
ref=ref_str,
|
194
|
-
),
|
195
|
-
)
|
196
292
|
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
293
|
+
# Initialise empty table data grid (only empty cells)
|
294
|
+
table_data = [
|
295
|
+
[
|
296
|
+
TableCell(
|
297
|
+
text="",
|
298
|
+
# bbox=[0,0,0,0],
|
299
|
+
spans=[[i, j]],
|
300
|
+
obj_type="body",
|
301
|
+
)
|
302
|
+
for j in range(item.data.num_cols)
|
303
|
+
]
|
304
|
+
for i in range(item.data.num_rows)
|
207
305
|
]
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
min(cell.start_row_offset_idx, element.num_rows),
|
215
|
-
min(cell.end_row_offset_idx, element.num_rows),
|
216
|
-
):
|
217
|
-
for j in range(
|
218
|
-
min(cell.start_col_offset_idx, element.num_cols),
|
219
|
-
min(cell.end_col_offset_idx, element.num_cols),
|
306
|
+
|
307
|
+
# Overwrite cells in table data for which there is actual cell content.
|
308
|
+
for cell in item.data.table_cells:
|
309
|
+
for i in range(
|
310
|
+
min(cell.start_row_offset_idx, item.data.num_rows),
|
311
|
+
min(cell.end_row_offset_idx, item.data.num_rows),
|
220
312
|
):
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
celltype = "
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
):
|
234
|
-
for
|
313
|
+
for j in range(
|
314
|
+
min(cell.start_col_offset_idx, item.data.num_cols),
|
315
|
+
min(cell.end_col_offset_idx, item.data.num_cols),
|
316
|
+
):
|
317
|
+
celltype = "body"
|
318
|
+
if cell.column_header:
|
319
|
+
celltype = "col_header"
|
320
|
+
elif cell.row_header:
|
321
|
+
celltype = "row_header"
|
322
|
+
elif cell.row_section:
|
323
|
+
celltype = "row_section"
|
324
|
+
|
325
|
+
def make_spans(cell):
|
326
|
+
for rspan in range(
|
235
327
|
min(
|
236
|
-
cell.
|
328
|
+
cell.start_row_offset_idx,
|
329
|
+
item.data.num_rows,
|
330
|
+
),
|
331
|
+
min(
|
332
|
+
cell.end_row_offset_idx, item.data.num_rows
|
237
333
|
),
|
238
|
-
min(cell.end_col_offset_idx, element.num_cols),
|
239
334
|
):
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
335
|
+
for cspan in range(
|
336
|
+
min(
|
337
|
+
cell.start_col_offset_idx,
|
338
|
+
item.data.num_cols,
|
339
|
+
),
|
340
|
+
min(
|
341
|
+
cell.end_col_offset_idx,
|
342
|
+
item.data.num_cols,
|
343
|
+
),
|
344
|
+
):
|
345
|
+
yield [rspan, cspan]
|
346
|
+
|
347
|
+
spans = list(make_spans(cell))
|
348
|
+
table_data[i][j] = GlmTableCell(
|
349
|
+
text=cell.text,
|
350
|
+
bbox=(
|
351
|
+
cell.bbox.as_tuple()
|
352
|
+
if cell.bbox is not None
|
353
|
+
else None
|
354
|
+
), # check if this is bottom-left
|
355
|
+
spans=spans,
|
356
|
+
obj_type=celltype,
|
357
|
+
col=j,
|
358
|
+
row=i,
|
359
|
+
row_header=cell.row_header,
|
360
|
+
row_section=cell.row_section,
|
361
|
+
col_header=cell.column_header,
|
362
|
+
row_span=[
|
363
|
+
cell.start_row_offset_idx,
|
364
|
+
cell.end_row_offset_idx,
|
365
|
+
],
|
366
|
+
col_span=[
|
367
|
+
cell.start_col_offset_idx,
|
368
|
+
cell.end_col_offset_idx,
|
369
|
+
],
|
370
|
+
)
|
371
|
+
|
372
|
+
# Compute the caption
|
373
|
+
caption = item.caption_text(self.document)
|
374
|
+
|
375
|
+
tables.append(
|
376
|
+
DsSchemaTable(
|
377
|
+
text=caption,
|
378
|
+
num_cols=item.data.num_cols,
|
379
|
+
num_rows=item.data.num_rows,
|
380
|
+
obj_type=layout_label_to_ds_type.get(item.label),
|
381
|
+
data=table_data,
|
382
|
+
prov=[
|
383
|
+
Prov(
|
384
|
+
bbox=p.bbox.as_tuple(),
|
385
|
+
page=p.page_no,
|
386
|
+
span=[0, 0],
|
387
|
+
)
|
388
|
+
for p in item.prov
|
389
|
+
],
|
390
|
+
)
|
391
|
+
)
|
255
392
|
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
page=element.page_no + 1,
|
266
|
-
span=[0, 0],
|
267
|
-
)
|
268
|
-
],
|
393
|
+
elif isinstance(item, PictureItem):
|
394
|
+
index = len(figures)
|
395
|
+
ref_str = f"#/figures/{index}"
|
396
|
+
main_text.append(
|
397
|
+
Ref(
|
398
|
+
name=reverse_label_mapping[item.label],
|
399
|
+
obj_type=layout_label_to_ds_type.get(item.label),
|
400
|
+
ref=ref_str,
|
401
|
+
),
|
269
402
|
)
|
270
|
-
)
|
271
403
|
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
)
|
290
|
-
],
|
291
|
-
obj_type=layout_label_to_ds_type.get(element.label),
|
292
|
-
# data=[[]],
|
404
|
+
# Compute the caption
|
405
|
+
caption = item.caption_text(self.document)
|
406
|
+
|
407
|
+
figures.append(
|
408
|
+
Figure(
|
409
|
+
prov=[
|
410
|
+
Prov(
|
411
|
+
bbox=p.bbox.as_tuple(),
|
412
|
+
page=p.page_no,
|
413
|
+
span=[0, len(caption)],
|
414
|
+
)
|
415
|
+
for p in item.prov
|
416
|
+
],
|
417
|
+
obj_type=layout_label_to_ds_type.get(item.label),
|
418
|
+
text=caption,
|
419
|
+
# data=[[]],
|
420
|
+
)
|
293
421
|
)
|
294
|
-
)
|
295
422
|
|
296
423
|
page_dimensions = [
|
297
|
-
PageDimensions(page=p.page_no
|
298
|
-
for p in self.pages
|
424
|
+
PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
|
425
|
+
for p in self.document.pages.values()
|
299
426
|
]
|
300
427
|
|
301
428
|
ds_doc = DsDocument(
|
@@ -303,6 +430,10 @@ class ConvertedDocument(BaseModel):
|
|
303
430
|
description=desc,
|
304
431
|
file_info=file_info,
|
305
432
|
main_text=main_text,
|
433
|
+
equations=equations,
|
434
|
+
footnotes=footnotes,
|
435
|
+
page_headers=page_headers,
|
436
|
+
page_footers=page_footers,
|
306
437
|
tables=tables,
|
307
438
|
figures=figures,
|
308
439
|
page_dimensions=page_dimensions,
|
@@ -310,152 +441,76 @@ class ConvertedDocument(BaseModel):
|
|
310
441
|
|
311
442
|
return ds_doc
|
312
443
|
|
313
|
-
def render_as_dict(self):
|
314
|
-
return self.output.model_dump(by_alias=True, exclude_none=True)
|
315
|
-
|
316
|
-
def render_as_markdown(
|
317
|
-
self,
|
318
|
-
delim: str = "\n\n",
|
319
|
-
main_text_start: int = 0,
|
320
|
-
main_text_stop: Optional[int] = None,
|
321
|
-
main_text_labels: list[str] = [
|
322
|
-
"title",
|
323
|
-
"subtitle-level-1",
|
324
|
-
"paragraph",
|
325
|
-
"caption",
|
326
|
-
"table",
|
327
|
-
"figure",
|
328
|
-
],
|
329
|
-
strict_text: bool = False,
|
330
|
-
image_placeholder: str = "<!-- image -->",
|
331
|
-
):
|
332
|
-
return self.output.export_to_markdown(
|
333
|
-
delim=delim,
|
334
|
-
main_text_start=main_text_start,
|
335
|
-
main_text_stop=main_text_stop,
|
336
|
-
main_text_labels=main_text_labels,
|
337
|
-
strict_text=strict_text,
|
338
|
-
image_placeholder=image_placeholder,
|
339
|
-
)
|
340
|
-
|
341
|
-
def render_as_text(
|
342
|
-
self,
|
343
|
-
delim: str = "\n\n",
|
344
|
-
main_text_start: int = 0,
|
345
|
-
main_text_stop: Optional[int] = None,
|
346
|
-
main_text_labels: list[str] = [
|
347
|
-
"title",
|
348
|
-
"subtitle-level-1",
|
349
|
-
"paragraph",
|
350
|
-
"caption",
|
351
|
-
],
|
352
|
-
):
|
353
|
-
return self.output.export_to_markdown(
|
354
|
-
delim=delim,
|
355
|
-
main_text_start=main_text_start,
|
356
|
-
main_text_stop=main_text_stop,
|
357
|
-
main_text_labels=main_text_labels,
|
358
|
-
strict_text=True,
|
359
|
-
)
|
360
444
|
|
361
|
-
|
362
|
-
self,
|
363
|
-
delim: str = "\n\n",
|
364
|
-
main_text_start: int = 0,
|
365
|
-
main_text_stop: Optional[int] = None,
|
366
|
-
main_text_labels: list[str] = [
|
367
|
-
"title",
|
368
|
-
"subtitle-level-1",
|
369
|
-
"paragraph",
|
370
|
-
"caption",
|
371
|
-
"table",
|
372
|
-
"figure",
|
373
|
-
],
|
374
|
-
xsize: int = 100,
|
375
|
-
ysize: int = 100,
|
376
|
-
add_location: bool = True,
|
377
|
-
add_content: bool = True,
|
378
|
-
add_page_index: bool = True,
|
379
|
-
# table specific flags
|
380
|
-
add_table_cell_location: bool = False,
|
381
|
-
add_table_cell_label: bool = True,
|
382
|
-
add_table_cell_text: bool = True,
|
383
|
-
) -> str:
|
384
|
-
return self.output.export_to_document_tokens(
|
385
|
-
delim=delim,
|
386
|
-
main_text_start=main_text_start,
|
387
|
-
main_text_stop=main_text_stop,
|
388
|
-
main_text_labels=main_text_labels,
|
389
|
-
xsize=xsize,
|
390
|
-
ysize=ysize,
|
391
|
-
add_location=add_location,
|
392
|
-
add_content=add_content,
|
393
|
-
add_page_index=add_page_index,
|
394
|
-
# table specific flags
|
395
|
-
add_table_cell_location=add_table_cell_location,
|
396
|
-
add_table_cell_label=add_table_cell_label,
|
397
|
-
add_table_cell_text=add_table_cell_text,
|
398
|
-
)
|
445
|
+
class _DocumentConversionInput(BaseModel):
|
399
446
|
|
400
|
-
|
401
|
-
self, element_types: Tuple[PageElement] = (FigureElement,)
|
402
|
-
):
|
403
|
-
for element in self.assembled.elements:
|
404
|
-
if isinstance(element, element_types):
|
405
|
-
page_ix = element.page_no
|
406
|
-
scale = self.pages[page_ix]._default_image_scale
|
407
|
-
crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
|
408
|
-
page_height=self.pages[page_ix].size.height * scale
|
409
|
-
)
|
410
|
-
|
411
|
-
cropped_im = self.pages[page_ix].image.crop(crop_bbox.as_tuple())
|
412
|
-
yield element, cropped_im
|
413
|
-
|
414
|
-
|
415
|
-
class ConversionResult(ConvertedDocument):
|
416
|
-
pass
|
417
|
-
|
418
|
-
|
419
|
-
class DocumentConversionInput(BaseModel):
|
420
|
-
|
421
|
-
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
|
447
|
+
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
422
448
|
limits: Optional[DocumentLimits] = DocumentLimits()
|
423
449
|
|
424
|
-
DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
|
425
|
-
|
426
450
|
def docs(
|
427
|
-
self,
|
451
|
+
self, format_options: Dict[InputFormat, "FormatOption"]
|
428
452
|
) -> Iterable[InputDocument]:
|
453
|
+
for item in self.path_or_stream_iterator:
|
454
|
+
obj = resolve_file_source(item) if isinstance(item, str) else item
|
455
|
+
format = self._guess_format(obj)
|
456
|
+
if format not in format_options.keys():
|
457
|
+
_log.info(
|
458
|
+
f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
|
459
|
+
)
|
460
|
+
continue
|
461
|
+
else:
|
462
|
+
backend = format_options[format].backend
|
429
463
|
|
430
|
-
pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND
|
431
|
-
|
432
|
-
for obj in self._path_or_stream_iterator:
|
433
464
|
if isinstance(obj, Path):
|
434
465
|
yield InputDocument(
|
435
|
-
path_or_stream=obj,
|
466
|
+
path_or_stream=obj,
|
467
|
+
format=format,
|
468
|
+
filename=obj.name,
|
469
|
+
limits=self.limits,
|
470
|
+
backend=backend,
|
436
471
|
)
|
437
472
|
elif isinstance(obj, DocumentStream):
|
438
473
|
yield InputDocument(
|
439
474
|
path_or_stream=obj.stream,
|
440
|
-
|
475
|
+
format=format,
|
476
|
+
filename=obj.name,
|
441
477
|
limits=self.limits,
|
442
|
-
|
478
|
+
backend=backend,
|
443
479
|
)
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
480
|
+
else:
|
481
|
+
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
|
482
|
+
|
483
|
+
def _guess_format(self, obj):
|
484
|
+
content = None
|
485
|
+
if isinstance(obj, Path):
|
486
|
+
mime = filetype.guess_mime(str(obj))
|
487
|
+
if mime is None:
|
488
|
+
with obj.open("rb") as f:
|
489
|
+
content = f.read(1024) # Read first 1KB
|
490
|
+
|
491
|
+
elif isinstance(obj, DocumentStream):
|
492
|
+
obj.stream.seek(0)
|
493
|
+
content = obj.stream.read(8192)
|
494
|
+
obj.stream.seek(0)
|
495
|
+
mime = filetype.guess_mime(content)
|
496
|
+
|
497
|
+
if mime is None:
|
498
|
+
mime = self._detect_html_xhtml(content)
|
499
|
+
|
500
|
+
format = MimeTypeToFormat.get(mime)
|
501
|
+
return format
|
502
|
+
|
503
|
+
def _detect_html_xhtml(self, content):
|
504
|
+
content_str = content.decode("ascii", errors="ignore").lower()
|
505
|
+
# Remove XML comments
|
506
|
+
content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
|
507
|
+
content_str = content_str.lstrip()
|
508
|
+
|
509
|
+
if re.match(r"<\?xml", content_str):
|
510
|
+
if "xhtml" in content_str[:1000]:
|
511
|
+
return "application/xhtml+xml"
|
512
|
+
|
513
|
+
if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
|
514
|
+
return "text/html"
|
515
|
+
|
516
|
+
return None
|