docling 1.19.0__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. docling/backend/abstract_backend.py +32 -37
  2. docling/backend/docling_parse_backend.py +16 -12
  3. docling/backend/docling_parse_v2_backend.py +240 -0
  4. docling/backend/html_backend.py +425 -0
  5. docling/backend/mspowerpoint_backend.py +375 -0
  6. docling/backend/msword_backend.py +509 -0
  7. docling/backend/pdf_backend.py +78 -0
  8. docling/backend/pypdfium2_backend.py +15 -10
  9. docling/cli/main.py +61 -60
  10. docling/datamodel/base_models.py +73 -193
  11. docling/datamodel/document.py +379 -324
  12. docling/datamodel/pipeline_options.py +16 -0
  13. docling/datamodel/settings.py +1 -0
  14. docling/document_converter.py +215 -252
  15. docling/models/base_model.py +25 -0
  16. docling/models/base_ocr_model.py +19 -6
  17. docling/models/ds_glm_model.py +220 -22
  18. docling/models/easyocr_model.py +45 -40
  19. docling/models/layout_model.py +130 -114
  20. docling/models/page_assemble_model.py +119 -95
  21. docling/models/page_preprocessing_model.py +61 -0
  22. docling/models/table_structure_model.py +122 -111
  23. docling/models/tesseract_ocr_cli_model.py +65 -58
  24. docling/models/tesseract_ocr_model.py +58 -50
  25. docling/pipeline/base_pipeline.py +190 -0
  26. docling/pipeline/simple_pipeline.py +59 -0
  27. docling/pipeline/standard_pdf_pipeline.py +198 -0
  28. docling/utils/export.py +4 -3
  29. docling/utils/layout_utils.py +17 -11
  30. docling-2.1.0.dist-info/METADATA +149 -0
  31. docling-2.1.0.dist-info/RECORD +42 -0
  32. docling/pipeline/base_model_pipeline.py +0 -18
  33. docling/pipeline/standard_model_pipeline.py +0 -66
  34. docling-1.19.0.dist-info/METADATA +0 -380
  35. docling-1.19.0.dist-info/RECORD +0 -34
  36. {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/LICENSE +0 -0
  37. {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/WHEEL +0 -0
  38. {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/entry_points.txt +0 -0
@@ -1,87 +1,110 @@
1
1
  import logging
2
+ import re
3
+ from enum import Enum
2
4
  from io import BytesIO
3
5
  from pathlib import Path, PurePath
4
- from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
5
-
6
- from docling_core.types import BaseCell, BaseText
7
- from docling_core.types import Document as DsDocument
8
- from docling_core.types import DocumentDescription as DsDocumentDescription
9
- from docling_core.types import FileInfoObject as DsFileInfoObject
10
- from docling_core.types import PageDimensions, PageReference, Prov, Ref
11
- from docling_core.types import Table as DsSchemaTable
12
- from docling_core.types import TableCell
13
- from docling_core.types.doc.base import BoundingBox as DsBoundingBox
14
- from docling_core.types.doc.base import Figure
6
+ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
7
+
8
+ import filetype
9
+ from docling_core.types.doc import (
10
+ DocItem,
11
+ DocItemLabel,
12
+ DoclingDocument,
13
+ PictureItem,
14
+ SectionHeaderItem,
15
+ TableItem,
16
+ TextItem,
17
+ )
18
+ from docling_core.types.doc.document import ListItem
19
+ from docling_core.types.legacy_doc.base import (
20
+ BaseText,
21
+ Figure,
22
+ GlmTableCell,
23
+ PageDimensions,
24
+ PageReference,
25
+ Prov,
26
+ Ref,
27
+ )
28
+ from docling_core.types.legacy_doc.base import Table as DsSchemaTable
29
+ from docling_core.types.legacy_doc.base import TableCell
30
+ from docling_core.types.legacy_doc.document import (
31
+ CCSDocumentDescription as DsDocumentDescription,
32
+ )
33
+ from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
34
+ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
35
+ from docling_core.utils.file import resolve_file_source
15
36
  from pydantic import BaseModel
16
37
  from typing_extensions import deprecated
17
38
 
18
- from docling.backend.abstract_backend import PdfDocumentBackend
19
- from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
39
+ from docling.backend.abstract_backend import (
40
+ AbstractDocumentBackend,
41
+ PaginatedDocumentBackend,
42
+ )
20
43
  from docling.datamodel.base_models import (
21
44
  AssembledUnit,
22
45
  ConversionStatus,
23
46
  DocumentStream,
24
47
  ErrorItem,
25
- FigureElement,
48
+ InputFormat,
49
+ MimeTypeToFormat,
26
50
  Page,
27
- PageElement,
28
- TableElement,
29
- TextElement,
30
51
  )
31
52
  from docling.datamodel.settings import DocumentLimits
32
- from docling.utils.utils import create_file_hash
53
+ from docling.utils.utils import create_file_hash, create_hash
54
+
55
+ if TYPE_CHECKING:
56
+ from docling.document_converter import FormatOption
33
57
 
34
58
  _log = logging.getLogger(__name__)
35
59
 
36
60
  layout_label_to_ds_type = {
37
- "Title": "title",
38
- "Document Index": "table-of-path_or_stream",
39
- "Section-header": "subtitle-level-1",
40
- "Checkbox-Selected": "checkbox-selected",
41
- "Checkbox-Unselected": "checkbox-unselected",
42
- "Caption": "caption",
43
- "Page-header": "page-header",
44
- "Page-footer": "page-footer",
45
- "Footnote": "footnote",
46
- "Table": "table",
47
- "Formula": "equation",
48
- "List-item": "paragraph",
49
- "Code": "paragraph",
50
- "Picture": "figure",
51
- "Text": "paragraph",
61
+ DocItemLabel.TITLE: "title",
62
+ DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
63
+ DocItemLabel.SECTION_HEADER: "subtitle-level-1",
64
+ DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
65
+ DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
66
+ DocItemLabel.CAPTION: "caption",
67
+ DocItemLabel.PAGE_HEADER: "page-header",
68
+ DocItemLabel.PAGE_FOOTER: "page-footer",
69
+ DocItemLabel.FOOTNOTE: "footnote",
70
+ DocItemLabel.TABLE: "table",
71
+ DocItemLabel.FORMULA: "equation",
72
+ DocItemLabel.LIST_ITEM: "paragraph",
73
+ DocItemLabel.CODE: "paragraph",
74
+ DocItemLabel.PICTURE: "figure",
75
+ DocItemLabel.TEXT: "paragraph",
76
+ DocItemLabel.PARAGRAPH: "paragraph",
52
77
  }
53
78
 
54
- _EMPTY_DOC = DsDocument(
55
- _name="",
56
- description=DsDocumentDescription(logs=[]),
57
- file_info=DsFileInfoObject(
58
- filename="",
59
- document_hash="",
60
- ),
61
- )
79
+ _EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
62
80
 
63
81
 
64
82
  class InputDocument(BaseModel):
65
- file: PurePath = None
66
- document_hash: Optional[str] = None
67
- valid: bool = False
83
+ file: PurePath
84
+ document_hash: str # = None
85
+ valid: bool = True
68
86
  limits: DocumentLimits = DocumentLimits()
87
+ format: InputFormat # = None
69
88
 
70
89
  filesize: Optional[int] = None
71
- page_count: Optional[int] = None
90
+ page_count: int = 0
72
91
 
73
- _backend: PdfDocumentBackend = None # Internal PDF backend used
92
+ _backend: AbstractDocumentBackend # Internal PDF backend used
74
93
 
75
94
  def __init__(
76
95
  self,
77
96
  path_or_stream: Union[BytesIO, Path],
97
+ format: InputFormat,
98
+ backend: Type[AbstractDocumentBackend],
78
99
  filename: Optional[str] = None,
79
100
  limits: Optional[DocumentLimits] = None,
80
- pdf_backend=DoclingParseDocumentBackend,
81
101
  ):
82
- super().__init__()
102
+ super().__init__(
103
+ file="", document_hash="", format=InputFormat.PDF
104
+ ) # initialize with dummy values
83
105
 
84
106
  self.limits = limits or DocumentLimits()
107
+ self.format = format
85
108
 
86
109
  try:
87
110
  if isinstance(path_or_stream, Path):
@@ -91,11 +114,12 @@ class InputDocument(BaseModel):
91
114
  self.valid = False
92
115
  else:
93
116
  self.document_hash = create_file_hash(path_or_stream)
94
- self._backend = pdf_backend(
95
- path_or_stream=path_or_stream, document_hash=self.document_hash
96
- )
117
+ self._init_doc(backend, path_or_stream)
97
118
 
98
119
  elif isinstance(path_or_stream, BytesIO):
120
+ assert (
121
+ filename is not None
122
+ ), "Can't construct InputDocument from stream without providing filename arg."
99
123
  self.file = PurePath(filename)
100
124
  self.filesize = path_or_stream.getbuffer().nbytes
101
125
 
@@ -103,15 +127,20 @@ class InputDocument(BaseModel):
103
127
  self.valid = False
104
128
  else:
105
129
  self.document_hash = create_file_hash(path_or_stream)
106
- self._backend = pdf_backend(
107
- path_or_stream=path_or_stream, document_hash=self.document_hash
108
- )
109
-
110
- if self.document_hash and self._backend.page_count() > 0:
111
- self.page_count = self._backend.page_count()
130
+ self._init_doc(backend, path_or_stream)
131
+ else:
132
+ raise RuntimeError(
133
+ f"Unexpected type path_or_stream: {type(path_or_stream)}"
134
+ )
112
135
 
113
- if self.page_count <= self.limits.max_num_pages:
114
- self.valid = True
136
+ # For paginated backends, check if the maximum page count is exceeded.
137
+ if self.valid and self._backend.is_valid():
138
+ if self._backend.supports_pagination() and isinstance(
139
+ self._backend, PaginatedDocumentBackend
140
+ ):
141
+ self.page_count = self._backend.page_count()
142
+ if not self.page_count <= self.limits.max_num_pages:
143
+ self.valid = False
115
144
 
116
145
  except (FileNotFoundError, OSError) as e:
117
146
  _log.exception(
@@ -125,9 +154,26 @@ class InputDocument(BaseModel):
125
154
  )
126
155
  # raise
127
156
 
157
+ def _init_doc(
158
+ self,
159
+ backend: Type[AbstractDocumentBackend],
160
+ path_or_stream: Union[BytesIO, Path],
161
+ ) -> None:
162
+ if backend is None:
163
+ raise RuntimeError(
164
+ f"No backend configuration provided for file {self.file.name} with format {self.format}. "
165
+ f"Please check your format configuration on DocumentConverter."
166
+ )
167
+
168
+ self._backend = backend(self, path_or_stream=path_or_stream)
128
169
 
129
- @deprecated("Use `ConversionResult` instead.")
130
- class ConvertedDocument(BaseModel):
170
+
171
+ class DocumentFormat(str, Enum):
172
+ V2 = "v2"
173
+ V1 = "v1"
174
+
175
+
176
+ class ConversionResult(BaseModel):
131
177
  input: InputDocument
132
178
 
133
179
  status: ConversionStatus = ConversionStatus.PENDING # failure, success
@@ -136,15 +182,42 @@ class ConvertedDocument(BaseModel):
136
182
  pages: List[Page] = []
137
183
  assembled: AssembledUnit = AssembledUnit()
138
184
 
139
- output: DsDocument = _EMPTY_DOC
185
+ document: DoclingDocument = _EMPTY_DOCLING_DOC
186
+
187
+ @property
188
+ @deprecated("Use document instead.")
189
+ def legacy_document(self):
190
+ reverse_label_mapping = {
191
+ DocItemLabel.CAPTION.value: "Caption",
192
+ DocItemLabel.FOOTNOTE.value: "Footnote",
193
+ DocItemLabel.FORMULA.value: "Formula",
194
+ DocItemLabel.LIST_ITEM.value: "List-item",
195
+ DocItemLabel.PAGE_FOOTER.value: "Page-footer",
196
+ DocItemLabel.PAGE_HEADER.value: "Page-header",
197
+ DocItemLabel.PICTURE.value: "Picture", # low threshold adjust to capture chemical structures for examples.
198
+ DocItemLabel.SECTION_HEADER.value: "Section-header",
199
+ DocItemLabel.TABLE.value: "Table",
200
+ DocItemLabel.TEXT.value: "Text",
201
+ DocItemLabel.TITLE.value: "Title",
202
+ DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
203
+ DocItemLabel.CODE.value: "Code",
204
+ DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
205
+ DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
206
+ DocItemLabel.FORM.value: "Form",
207
+ DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
208
+ DocItemLabel.PARAGRAPH.value: "paragraph",
209
+ }
140
210
 
141
- def _to_ds_document(self) -> DsDocument:
142
211
  title = ""
143
212
  desc = DsDocumentDescription(logs=[])
144
213
 
145
214
  page_hashes = [
146
- PageReference(hash=p.page_hash, page=p.page_no + 1, model="default")
147
- for p in self.pages
215
+ PageReference(
216
+ hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
217
+ page=p.page_no,
218
+ model="default",
219
+ )
220
+ for p in self.document.pages.values()
148
221
  ]
149
222
 
150
223
  file_info = DsFileInfoObject(
@@ -157,145 +230,199 @@ class ConvertedDocument(BaseModel):
157
230
  main_text = []
158
231
  tables = []
159
232
  figures = []
233
+ equations = []
234
+ footnotes = []
235
+ page_headers = []
236
+ page_footers = []
237
+
238
+ embedded_captions = set()
239
+ for ix, (item, level) in enumerate(
240
+ self.document.iterate_items(self.document.body)
241
+ ):
242
+
243
+ if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
244
+ caption = item.caption_text(self.document)
245
+ if caption:
246
+ embedded_captions.add(caption)
247
+
248
+ for item, level in self.document.iterate_items():
249
+ if isinstance(item, DocItem):
250
+ item_type = item.label
251
+
252
+ if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
253
+
254
+ if isinstance(item, ListItem) and item.marker:
255
+ text = f"{item.marker} {item.text}"
256
+ else:
257
+ text = item.text
258
+
259
+ # Can be empty.
260
+ prov = [
261
+ Prov(
262
+ bbox=p.bbox.as_tuple(),
263
+ page=p.page_no,
264
+ span=[0, len(item.text)],
265
+ )
266
+ for p in item.prov
267
+ ]
268
+ main_text.append(
269
+ BaseText(
270
+ text=text,
271
+ obj_type=layout_label_to_ds_type.get(item.label),
272
+ name=reverse_label_mapping[item.label],
273
+ prov=prov,
274
+ )
275
+ )
160
276
 
161
- page_no_to_page = {p.page_no: p for p in self.pages}
162
-
163
- for element in self.assembled.elements:
164
- # Convert bboxes to lower-left origin.
165
- target_bbox = DsBoundingBox(
166
- element.cluster.bbox.to_bottom_left_origin(
167
- page_no_to_page[element.page_no].size.height
168
- ).as_tuple()
169
- )
170
-
171
- if isinstance(element, TextElement):
172
- main_text.append(
173
- BaseText(
174
- text=element.text,
175
- obj_type=layout_label_to_ds_type.get(element.label),
176
- name=element.label,
177
- prov=[
178
- Prov(
179
- bbox=target_bbox,
180
- page=element.page_no + 1,
181
- span=[0, len(element.text)],
182
- )
183
- ],
277
+ # skip captions of they are embedded in the actual
278
+ # floating object
279
+ if item_type == DocItemLabel.CAPTION and text in embedded_captions:
280
+ continue
281
+
282
+ elif isinstance(item, TableItem) and item.data:
283
+ index = len(tables)
284
+ ref_str = f"#/tables/{index}"
285
+ main_text.append(
286
+ Ref(
287
+ name=reverse_label_mapping[item.label],
288
+ obj_type=layout_label_to_ds_type.get(item.label),
289
+ ref=ref_str,
290
+ ),
184
291
  )
185
- )
186
- elif isinstance(element, TableElement):
187
- index = len(tables)
188
- ref_str = f"#/tables/{index}"
189
- main_text.append(
190
- Ref(
191
- name=element.label,
192
- obj_type=layout_label_to_ds_type.get(element.label),
193
- ref=ref_str,
194
- ),
195
- )
196
292
 
197
- # Initialise empty table data grid (only empty cells)
198
- table_data = [
199
- [
200
- TableCell(
201
- text="",
202
- # bbox=[0,0,0,0],
203
- spans=[[i, j]],
204
- obj_type="body",
205
- )
206
- for j in range(element.num_cols)
293
+ # Initialise empty table data grid (only empty cells)
294
+ table_data = [
295
+ [
296
+ TableCell(
297
+ text="",
298
+ # bbox=[0,0,0,0],
299
+ spans=[[i, j]],
300
+ obj_type="body",
301
+ )
302
+ for j in range(item.data.num_cols)
303
+ ]
304
+ for i in range(item.data.num_rows)
207
305
  ]
208
- for i in range(element.num_rows)
209
- ]
210
-
211
- # Overwrite cells in table data for which there is actual cell content.
212
- for cell in element.table_cells:
213
- for i in range(
214
- min(cell.start_row_offset_idx, element.num_rows),
215
- min(cell.end_row_offset_idx, element.num_rows),
216
- ):
217
- for j in range(
218
- min(cell.start_col_offset_idx, element.num_cols),
219
- min(cell.end_col_offset_idx, element.num_cols),
306
+
307
+ # Overwrite cells in table data for which there is actual cell content.
308
+ for cell in item.data.table_cells:
309
+ for i in range(
310
+ min(cell.start_row_offset_idx, item.data.num_rows),
311
+ min(cell.end_row_offset_idx, item.data.num_rows),
220
312
  ):
221
- celltype = "body"
222
- if cell.column_header:
223
- celltype = "col_header"
224
- elif cell.row_header:
225
- celltype = "row_header"
226
- elif cell.row_section:
227
- celltype = "row_section"
228
-
229
- def make_spans(cell):
230
- for rspan in range(
231
- min(cell.start_row_offset_idx, element.num_rows),
232
- min(cell.end_row_offset_idx, element.num_rows),
233
- ):
234
- for cspan in range(
313
+ for j in range(
314
+ min(cell.start_col_offset_idx, item.data.num_cols),
315
+ min(cell.end_col_offset_idx, item.data.num_cols),
316
+ ):
317
+ celltype = "body"
318
+ if cell.column_header:
319
+ celltype = "col_header"
320
+ elif cell.row_header:
321
+ celltype = "row_header"
322
+ elif cell.row_section:
323
+ celltype = "row_section"
324
+
325
+ def make_spans(cell):
326
+ for rspan in range(
235
327
  min(
236
- cell.start_col_offset_idx, element.num_cols
328
+ cell.start_row_offset_idx,
329
+ item.data.num_rows,
330
+ ),
331
+ min(
332
+ cell.end_row_offset_idx, item.data.num_rows
237
333
  ),
238
- min(cell.end_col_offset_idx, element.num_cols),
239
334
  ):
240
- yield [rspan, cspan]
241
-
242
- spans = list(make_spans(cell))
243
- table_data[i][j] = TableCell(
244
- text=cell.text,
245
- bbox=cell.bbox.to_bottom_left_origin(
246
- page_no_to_page[element.page_no].size.height
247
- ).as_tuple(),
248
- # col=j,
249
- # row=i,
250
- spans=spans,
251
- obj_type=celltype,
252
- # col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
253
- # row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
254
- )
335
+ for cspan in range(
336
+ min(
337
+ cell.start_col_offset_idx,
338
+ item.data.num_cols,
339
+ ),
340
+ min(
341
+ cell.end_col_offset_idx,
342
+ item.data.num_cols,
343
+ ),
344
+ ):
345
+ yield [rspan, cspan]
346
+
347
+ spans = list(make_spans(cell))
348
+ table_data[i][j] = GlmTableCell(
349
+ text=cell.text,
350
+ bbox=(
351
+ cell.bbox.as_tuple()
352
+ if cell.bbox is not None
353
+ else None
354
+ ), # check if this is bottom-left
355
+ spans=spans,
356
+ obj_type=celltype,
357
+ col=j,
358
+ row=i,
359
+ row_header=cell.row_header,
360
+ row_section=cell.row_section,
361
+ col_header=cell.column_header,
362
+ row_span=[
363
+ cell.start_row_offset_idx,
364
+ cell.end_row_offset_idx,
365
+ ],
366
+ col_span=[
367
+ cell.start_col_offset_idx,
368
+ cell.end_col_offset_idx,
369
+ ],
370
+ )
371
+
372
+ # Compute the caption
373
+ caption = item.caption_text(self.document)
374
+
375
+ tables.append(
376
+ DsSchemaTable(
377
+ text=caption,
378
+ num_cols=item.data.num_cols,
379
+ num_rows=item.data.num_rows,
380
+ obj_type=layout_label_to_ds_type.get(item.label),
381
+ data=table_data,
382
+ prov=[
383
+ Prov(
384
+ bbox=p.bbox.as_tuple(),
385
+ page=p.page_no,
386
+ span=[0, 0],
387
+ )
388
+ for p in item.prov
389
+ ],
390
+ )
391
+ )
255
392
 
256
- tables.append(
257
- DsSchemaTable(
258
- num_cols=element.num_cols,
259
- num_rows=element.num_rows,
260
- obj_type=layout_label_to_ds_type.get(element.label),
261
- data=table_data,
262
- prov=[
263
- Prov(
264
- bbox=target_bbox,
265
- page=element.page_no + 1,
266
- span=[0, 0],
267
- )
268
- ],
393
+ elif isinstance(item, PictureItem):
394
+ index = len(figures)
395
+ ref_str = f"#/figures/{index}"
396
+ main_text.append(
397
+ Ref(
398
+ name=reverse_label_mapping[item.label],
399
+ obj_type=layout_label_to_ds_type.get(item.label),
400
+ ref=ref_str,
401
+ ),
269
402
  )
270
- )
271
403
 
272
- elif isinstance(element, FigureElement):
273
- index = len(figures)
274
- ref_str = f"#/figures/{index}"
275
- main_text.append(
276
- Ref(
277
- name=element.label,
278
- obj_type=layout_label_to_ds_type.get(element.label),
279
- ref=ref_str,
280
- ),
281
- )
282
- figures.append(
283
- Figure(
284
- prov=[
285
- Prov(
286
- bbox=target_bbox,
287
- page=element.page_no + 1,
288
- span=[0, 0],
289
- )
290
- ],
291
- obj_type=layout_label_to_ds_type.get(element.label),
292
- # data=[[]],
404
+ # Compute the caption
405
+ caption = item.caption_text(self.document)
406
+
407
+ figures.append(
408
+ Figure(
409
+ prov=[
410
+ Prov(
411
+ bbox=p.bbox.as_tuple(),
412
+ page=p.page_no,
413
+ span=[0, len(caption)],
414
+ )
415
+ for p in item.prov
416
+ ],
417
+ obj_type=layout_label_to_ds_type.get(item.label),
418
+ text=caption,
419
+ # data=[[]],
420
+ )
293
421
  )
294
- )
295
422
 
296
423
  page_dimensions = [
297
- PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
298
- for p in self.pages
424
+ PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
425
+ for p in self.document.pages.values()
299
426
  ]
300
427
 
301
428
  ds_doc = DsDocument(
@@ -303,6 +430,10 @@ class ConvertedDocument(BaseModel):
303
430
  description=desc,
304
431
  file_info=file_info,
305
432
  main_text=main_text,
433
+ equations=equations,
434
+ footnotes=footnotes,
435
+ page_headers=page_headers,
436
+ page_footers=page_footers,
306
437
  tables=tables,
307
438
  figures=figures,
308
439
  page_dimensions=page_dimensions,
@@ -310,152 +441,76 @@ class ConvertedDocument(BaseModel):
310
441
 
311
442
  return ds_doc
312
443
 
313
- def render_as_dict(self):
314
- return self.output.model_dump(by_alias=True, exclude_none=True)
315
-
316
- def render_as_markdown(
317
- self,
318
- delim: str = "\n\n",
319
- main_text_start: int = 0,
320
- main_text_stop: Optional[int] = None,
321
- main_text_labels: list[str] = [
322
- "title",
323
- "subtitle-level-1",
324
- "paragraph",
325
- "caption",
326
- "table",
327
- "figure",
328
- ],
329
- strict_text: bool = False,
330
- image_placeholder: str = "<!-- image -->",
331
- ):
332
- return self.output.export_to_markdown(
333
- delim=delim,
334
- main_text_start=main_text_start,
335
- main_text_stop=main_text_stop,
336
- main_text_labels=main_text_labels,
337
- strict_text=strict_text,
338
- image_placeholder=image_placeholder,
339
- )
340
-
341
- def render_as_text(
342
- self,
343
- delim: str = "\n\n",
344
- main_text_start: int = 0,
345
- main_text_stop: Optional[int] = None,
346
- main_text_labels: list[str] = [
347
- "title",
348
- "subtitle-level-1",
349
- "paragraph",
350
- "caption",
351
- ],
352
- ):
353
- return self.output.export_to_markdown(
354
- delim=delim,
355
- main_text_start=main_text_start,
356
- main_text_stop=main_text_stop,
357
- main_text_labels=main_text_labels,
358
- strict_text=True,
359
- )
360
444
 
361
- def render_as_doctags(
362
- self,
363
- delim: str = "\n\n",
364
- main_text_start: int = 0,
365
- main_text_stop: Optional[int] = None,
366
- main_text_labels: list[str] = [
367
- "title",
368
- "subtitle-level-1",
369
- "paragraph",
370
- "caption",
371
- "table",
372
- "figure",
373
- ],
374
- xsize: int = 100,
375
- ysize: int = 100,
376
- add_location: bool = True,
377
- add_content: bool = True,
378
- add_page_index: bool = True,
379
- # table specific flags
380
- add_table_cell_location: bool = False,
381
- add_table_cell_label: bool = True,
382
- add_table_cell_text: bool = True,
383
- ) -> str:
384
- return self.output.export_to_document_tokens(
385
- delim=delim,
386
- main_text_start=main_text_start,
387
- main_text_stop=main_text_stop,
388
- main_text_labels=main_text_labels,
389
- xsize=xsize,
390
- ysize=ysize,
391
- add_location=add_location,
392
- add_content=add_content,
393
- add_page_index=add_page_index,
394
- # table specific flags
395
- add_table_cell_location=add_table_cell_location,
396
- add_table_cell_label=add_table_cell_label,
397
- add_table_cell_text=add_table_cell_text,
398
- )
445
+ class _DocumentConversionInput(BaseModel):
399
446
 
400
- def render_element_images(
401
- self, element_types: Tuple[PageElement] = (FigureElement,)
402
- ):
403
- for element in self.assembled.elements:
404
- if isinstance(element, element_types):
405
- page_ix = element.page_no
406
- scale = self.pages[page_ix]._default_image_scale
407
- crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
408
- page_height=self.pages[page_ix].size.height * scale
409
- )
410
-
411
- cropped_im = self.pages[page_ix].image.crop(crop_bbox.as_tuple())
412
- yield element, cropped_im
413
-
414
-
415
- class ConversionResult(ConvertedDocument):
416
- pass
417
-
418
-
419
- class DocumentConversionInput(BaseModel):
420
-
421
- _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
447
+ path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
422
448
  limits: Optional[DocumentLimits] = DocumentLimits()
423
449
 
424
- DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
425
-
426
450
  def docs(
427
- self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
451
+ self, format_options: Dict[InputFormat, "FormatOption"]
428
452
  ) -> Iterable[InputDocument]:
453
+ for item in self.path_or_stream_iterator:
454
+ obj = resolve_file_source(item) if isinstance(item, str) else item
455
+ format = self._guess_format(obj)
456
+ if format not in format_options.keys():
457
+ _log.info(
458
+ f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
459
+ )
460
+ continue
461
+ else:
462
+ backend = format_options[format].backend
429
463
 
430
- pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND
431
-
432
- for obj in self._path_or_stream_iterator:
433
464
  if isinstance(obj, Path):
434
465
  yield InputDocument(
435
- path_or_stream=obj, limits=self.limits, pdf_backend=pdf_backend
466
+ path_or_stream=obj,
467
+ format=format,
468
+ filename=obj.name,
469
+ limits=self.limits,
470
+ backend=backend,
436
471
  )
437
472
  elif isinstance(obj, DocumentStream):
438
473
  yield InputDocument(
439
474
  path_or_stream=obj.stream,
440
- filename=obj.filename,
475
+ format=format,
476
+ filename=obj.name,
441
477
  limits=self.limits,
442
- pdf_backend=pdf_backend,
478
+ backend=backend,
443
479
  )
444
-
445
- @classmethod
446
- def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
447
- paths = [Path(p) for p in paths]
448
-
449
- doc_input = cls(limits=limits)
450
- doc_input._path_or_stream_iterator = paths
451
-
452
- return doc_input
453
-
454
- @classmethod
455
- def from_streams(
456
- cls, streams: Iterable[DocumentStream], limits: Optional[DocumentLimits] = None
457
- ):
458
- doc_input = cls(limits=limits)
459
- doc_input._path_or_stream_iterator = streams
460
-
461
- return doc_input
480
+ else:
481
+ raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
482
+
483
+ def _guess_format(self, obj):
484
+ content = None
485
+ if isinstance(obj, Path):
486
+ mime = filetype.guess_mime(str(obj))
487
+ if mime is None:
488
+ with obj.open("rb") as f:
489
+ content = f.read(1024) # Read first 1KB
490
+
491
+ elif isinstance(obj, DocumentStream):
492
+ obj.stream.seek(0)
493
+ content = obj.stream.read(8192)
494
+ obj.stream.seek(0)
495
+ mime = filetype.guess_mime(content)
496
+
497
+ if mime is None:
498
+ mime = self._detect_html_xhtml(content)
499
+
500
+ format = MimeTypeToFormat.get(mime)
501
+ return format
502
+
503
+ def _detect_html_xhtml(self, content):
504
+ content_str = content.decode("ascii", errors="ignore").lower()
505
+ # Remove XML comments
506
+ content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
507
+ content_str = content_str.lstrip()
508
+
509
+ if re.match(r"<\?xml", content_str):
510
+ if "xhtml" in content_str[:1000]:
511
+ return "application/xhtml+xml"
512
+
513
+ if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
514
+ return "text/html"
515
+
516
+ return None