docling 1.20.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. docling/backend/abstract_backend.py +32 -37
  2. docling/backend/docling_parse_backend.py +16 -12
  3. docling/backend/docling_parse_v2_backend.py +15 -11
  4. docling/backend/html_backend.py +425 -0
  5. docling/backend/mspowerpoint_backend.py +375 -0
  6. docling/backend/msword_backend.py +509 -0
  7. docling/backend/pdf_backend.py +78 -0
  8. docling/backend/pypdfium2_backend.py +15 -10
  9. docling/cli/main.py +61 -60
  10. docling/datamodel/base_models.py +73 -193
  11. docling/datamodel/document.py +364 -318
  12. docling/datamodel/pipeline_options.py +13 -0
  13. docling/datamodel/settings.py +1 -0
  14. docling/document_converter.py +215 -252
  15. docling/models/base_model.py +25 -0
  16. docling/models/base_ocr_model.py +10 -5
  17. docling/models/ds_glm_model.py +209 -20
  18. docling/models/easyocr_model.py +4 -1
  19. docling/models/layout_model.py +73 -61
  20. docling/models/page_assemble_model.py +21 -5
  21. docling/models/page_preprocessing_model.py +57 -0
  22. docling/models/table_structure_model.py +34 -32
  23. docling/models/tesseract_ocr_cli_model.py +8 -5
  24. docling/models/tesseract_ocr_model.py +8 -5
  25. docling/pipeline/base_pipeline.py +190 -0
  26. docling/pipeline/simple_pipeline.py +59 -0
  27. docling/pipeline/standard_pdf_pipeline.py +198 -0
  28. docling/utils/export.py +4 -3
  29. docling/utils/layout_utils.py +17 -11
  30. docling-2.0.0.dist-info/METADATA +149 -0
  31. docling-2.0.0.dist-info/RECORD +42 -0
  32. docling/pipeline/base_model_pipeline.py +0 -18
  33. docling/pipeline/standard_model_pipeline.py +0 -66
  34. docling-1.20.0.dist-info/METADATA +0 -380
  35. docling-1.20.0.dist-info/RECORD +0 -35
  36. {docling-1.20.0.dist-info → docling-2.0.0.dist-info}/LICENSE +0 -0
  37. {docling-1.20.0.dist-info → docling-2.0.0.dist-info}/WHEEL +0 -0
  38. {docling-1.20.0.dist-info → docling-2.0.0.dist-info}/entry_points.txt +0 -0
@@ -1,87 +1,101 @@
1
1
  import logging
2
+ import re
3
+ from enum import Enum
2
4
  from io import BytesIO
3
5
  from pathlib import Path, PurePath
4
- from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
6
+ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
5
7
 
6
- from docling_core.types import BaseCell, BaseText
8
+ import filetype
9
+ from docling_core.types import BaseText
7
10
  from docling_core.types import Document as DsDocument
8
11
  from docling_core.types import DocumentDescription as DsDocumentDescription
9
12
  from docling_core.types import FileInfoObject as DsFileInfoObject
10
13
  from docling_core.types import PageDimensions, PageReference, Prov, Ref
11
14
  from docling_core.types import Table as DsSchemaTable
12
- from docling_core.types import TableCell
13
- from docling_core.types.doc.base import BoundingBox as DsBoundingBox
14
- from docling_core.types.doc.base import Figure
15
+ from docling_core.types.doc import (
16
+ DocItem,
17
+ DocItemLabel,
18
+ DoclingDocument,
19
+ PictureItem,
20
+ SectionHeaderItem,
21
+ TableItem,
22
+ TextItem,
23
+ )
24
+ from docling_core.types.doc.document import ListItem
25
+ from docling_core.types.legacy_doc.base import Figure, GlmTableCell, TableCell
26
+ from docling_core.utils.file import resolve_file_source
15
27
  from pydantic import BaseModel
16
28
  from typing_extensions import deprecated
17
29
 
18
- from docling.backend.abstract_backend import PdfDocumentBackend
19
- from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
30
+ from docling.backend.abstract_backend import (
31
+ AbstractDocumentBackend,
32
+ PaginatedDocumentBackend,
33
+ )
20
34
  from docling.datamodel.base_models import (
21
35
  AssembledUnit,
22
36
  ConversionStatus,
23
37
  DocumentStream,
24
38
  ErrorItem,
25
- FigureElement,
39
+ InputFormat,
40
+ MimeTypeToFormat,
26
41
  Page,
27
- PageElement,
28
- TableElement,
29
- TextElement,
30
42
  )
31
43
  from docling.datamodel.settings import DocumentLimits
32
- from docling.utils.utils import create_file_hash
44
+ from docling.utils.utils import create_file_hash, create_hash
45
+
46
+ if TYPE_CHECKING:
47
+ from docling.document_converter import FormatOption
33
48
 
34
49
  _log = logging.getLogger(__name__)
35
50
 
36
51
  layout_label_to_ds_type = {
37
- "Title": "title",
38
- "Document Index": "table-of-path_or_stream",
39
- "Section-header": "subtitle-level-1",
40
- "Checkbox-Selected": "checkbox-selected",
41
- "Checkbox-Unselected": "checkbox-unselected",
42
- "Caption": "caption",
43
- "Page-header": "page-header",
44
- "Page-footer": "page-footer",
45
- "Footnote": "footnote",
46
- "Table": "table",
47
- "Formula": "equation",
48
- "List-item": "paragraph",
49
- "Code": "paragraph",
50
- "Picture": "figure",
51
- "Text": "paragraph",
52
+ DocItemLabel.TITLE: "title",
53
+ DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
54
+ DocItemLabel.SECTION_HEADER: "subtitle-level-1",
55
+ DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
56
+ DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
57
+ DocItemLabel.CAPTION: "caption",
58
+ DocItemLabel.PAGE_HEADER: "page-header",
59
+ DocItemLabel.PAGE_FOOTER: "page-footer",
60
+ DocItemLabel.FOOTNOTE: "footnote",
61
+ DocItemLabel.TABLE: "table",
62
+ DocItemLabel.FORMULA: "equation",
63
+ DocItemLabel.LIST_ITEM: "paragraph",
64
+ DocItemLabel.CODE: "paragraph",
65
+ DocItemLabel.PICTURE: "figure",
66
+ DocItemLabel.TEXT: "paragraph",
67
+ DocItemLabel.PARAGRAPH: "paragraph",
52
68
  }
53
69
 
54
- _EMPTY_DOC = DsDocument(
55
- _name="",
56
- description=DsDocumentDescription(logs=[]),
57
- file_info=DsFileInfoObject(
58
- filename="",
59
- document_hash="",
60
- ),
61
- )
70
+ _EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
62
71
 
63
72
 
64
73
  class InputDocument(BaseModel):
65
- file: PurePath = None
66
- document_hash: Optional[str] = None
67
- valid: bool = False
74
+ file: PurePath
75
+ document_hash: str # = None
76
+ valid: bool = True
68
77
  limits: DocumentLimits = DocumentLimits()
78
+ format: InputFormat # = None
69
79
 
70
80
  filesize: Optional[int] = None
71
- page_count: Optional[int] = None
81
+ page_count: int = 0
72
82
 
73
- _backend: PdfDocumentBackend = None # Internal PDF backend used
83
+ _backend: AbstractDocumentBackend # Internal PDF backend used
74
84
 
75
85
  def __init__(
76
86
  self,
77
87
  path_or_stream: Union[BytesIO, Path],
88
+ format: InputFormat,
89
+ backend: Type[AbstractDocumentBackend],
78
90
  filename: Optional[str] = None,
79
91
  limits: Optional[DocumentLimits] = None,
80
- pdf_backend=DoclingParseDocumentBackend,
81
92
  ):
82
- super().__init__()
93
+ super().__init__(
94
+ file="", document_hash="", format=InputFormat.PDF
95
+ ) # initialize with dummy values
83
96
 
84
97
  self.limits = limits or DocumentLimits()
98
+ self.format = format
85
99
 
86
100
  try:
87
101
  if isinstance(path_or_stream, Path):
@@ -91,11 +105,12 @@ class InputDocument(BaseModel):
91
105
  self.valid = False
92
106
  else:
93
107
  self.document_hash = create_file_hash(path_or_stream)
94
- self._backend = pdf_backend(
95
- path_or_stream=path_or_stream, document_hash=self.document_hash
96
- )
108
+ self._init_doc(backend, path_or_stream)
97
109
 
98
110
  elif isinstance(path_or_stream, BytesIO):
111
+ assert (
112
+ filename is not None
113
+ ), "Can't construct InputDocument from stream without providing filename arg."
99
114
  self.file = PurePath(filename)
100
115
  self.filesize = path_or_stream.getbuffer().nbytes
101
116
 
@@ -103,15 +118,20 @@ class InputDocument(BaseModel):
103
118
  self.valid = False
104
119
  else:
105
120
  self.document_hash = create_file_hash(path_or_stream)
106
- self._backend = pdf_backend(
107
- path_or_stream=path_or_stream, document_hash=self.document_hash
108
- )
109
-
110
- if self.document_hash and self._backend.page_count() > 0:
111
- self.page_count = self._backend.page_count()
121
+ self._init_doc(backend, path_or_stream)
122
+ else:
123
+ raise RuntimeError(
124
+ f"Unexpected type path_or_stream: {type(path_or_stream)}"
125
+ )
112
126
 
113
- if self.page_count <= self.limits.max_num_pages:
114
- self.valid = True
127
+ # For paginated backends, check if the maximum page count is exceeded.
128
+ if self.valid and self._backend.is_valid():
129
+ if self._backend.supports_pagination() and isinstance(
130
+ self._backend, PaginatedDocumentBackend
131
+ ):
132
+ self.page_count = self._backend.page_count()
133
+ if not self.page_count <= self.limits.max_num_pages:
134
+ self.valid = False
115
135
 
116
136
  except (FileNotFoundError, OSError) as e:
117
137
  _log.exception(
@@ -125,9 +145,26 @@ class InputDocument(BaseModel):
125
145
  )
126
146
  # raise
127
147
 
148
+ def _init_doc(
149
+ self,
150
+ backend: Type[AbstractDocumentBackend],
151
+ path_or_stream: Union[BytesIO, Path],
152
+ ) -> None:
153
+ if backend is None:
154
+ raise RuntimeError(
155
+ f"No backend configuration provided for file {self.file.name} with format {self.format}. "
156
+ f"Please check your format configuration on DocumentConverter."
157
+ )
158
+
159
+ self._backend = backend(self, path_or_stream=path_or_stream)
160
+
128
161
 
129
- @deprecated("Use `ConversionResult` instead.")
130
- class ConvertedDocument(BaseModel):
162
+ class DocumentFormat(str, Enum):
163
+ V2 = "v2"
164
+ V1 = "v1"
165
+
166
+
167
+ class ConversionResult(BaseModel):
131
168
  input: InputDocument
132
169
 
133
170
  status: ConversionStatus = ConversionStatus.PENDING # failure, success
@@ -136,15 +173,42 @@ class ConvertedDocument(BaseModel):
136
173
  pages: List[Page] = []
137
174
  assembled: AssembledUnit = AssembledUnit()
138
175
 
139
- output: DsDocument = _EMPTY_DOC
176
+ document: DoclingDocument = _EMPTY_DOCLING_DOC
177
+
178
+ @property
179
+ @deprecated("Use document instead.")
180
+ def legacy_document(self):
181
+ reverse_label_mapping = {
182
+ DocItemLabel.CAPTION.value: "Caption",
183
+ DocItemLabel.FOOTNOTE.value: "Footnote",
184
+ DocItemLabel.FORMULA.value: "Formula",
185
+ DocItemLabel.LIST_ITEM.value: "List-item",
186
+ DocItemLabel.PAGE_FOOTER.value: "Page-footer",
187
+ DocItemLabel.PAGE_HEADER.value: "Page-header",
188
+ DocItemLabel.PICTURE.value: "Picture", # low threshold adjust to capture chemical structures for examples.
189
+ DocItemLabel.SECTION_HEADER.value: "Section-header",
190
+ DocItemLabel.TABLE.value: "Table",
191
+ DocItemLabel.TEXT.value: "Text",
192
+ DocItemLabel.TITLE.value: "Title",
193
+ DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
194
+ DocItemLabel.CODE.value: "Code",
195
+ DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
196
+ DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
197
+ DocItemLabel.FORM.value: "Form",
198
+ DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
199
+ DocItemLabel.PARAGRAPH.value: "paragraph",
200
+ }
140
201
 
141
- def _to_ds_document(self) -> DsDocument:
142
202
  title = ""
143
203
  desc = DsDocumentDescription(logs=[])
144
204
 
145
205
  page_hashes = [
146
- PageReference(hash=p.page_hash, page=p.page_no + 1, model="default")
147
- for p in self.pages
206
+ PageReference(
207
+ hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
208
+ page=p.page_no,
209
+ model="default",
210
+ )
211
+ for p in self.document.pages.values()
148
212
  ]
149
213
 
150
214
  file_info = DsFileInfoObject(
@@ -157,145 +221,199 @@ class ConvertedDocument(BaseModel):
157
221
  main_text = []
158
222
  tables = []
159
223
  figures = []
224
+ equations = []
225
+ footnotes = []
226
+ page_headers = []
227
+ page_footers = []
228
+
229
+ embedded_captions = set()
230
+ for ix, (item, level) in enumerate(
231
+ self.document.iterate_items(self.document.body)
232
+ ):
233
+
234
+ if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
235
+ caption = item.caption_text(self.document)
236
+ if caption:
237
+ embedded_captions.add(caption)
238
+
239
+ for item, level in self.document.iterate_items():
240
+ if isinstance(item, DocItem):
241
+ item_type = item.label
242
+
243
+ if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
244
+
245
+ if isinstance(item, ListItem) and item.marker:
246
+ text = f"{item.marker} {item.text}"
247
+ else:
248
+ text = item.text
249
+
250
+ # Can be empty.
251
+ prov = [
252
+ Prov(
253
+ bbox=p.bbox.as_tuple(),
254
+ page=p.page_no,
255
+ span=[0, len(item.text)],
256
+ )
257
+ for p in item.prov
258
+ ]
259
+ main_text.append(
260
+ BaseText(
261
+ text=text,
262
+ obj_type=layout_label_to_ds_type.get(item.label),
263
+ name=reverse_label_mapping[item.label],
264
+ prov=prov,
265
+ )
266
+ )
160
267
 
161
- page_no_to_page = {p.page_no: p for p in self.pages}
162
-
163
- for element in self.assembled.elements:
164
- # Convert bboxes to lower-left origin.
165
- target_bbox = DsBoundingBox(
166
- element.cluster.bbox.to_bottom_left_origin(
167
- page_no_to_page[element.page_no].size.height
168
- ).as_tuple()
169
- )
170
-
171
- if isinstance(element, TextElement):
172
- main_text.append(
173
- BaseText(
174
- text=element.text,
175
- obj_type=layout_label_to_ds_type.get(element.label),
176
- name=element.label,
177
- prov=[
178
- Prov(
179
- bbox=target_bbox,
180
- page=element.page_no + 1,
181
- span=[0, len(element.text)],
182
- )
183
- ],
268
+ # skip captions if they are embedded in the actual
269
+ # floating object
270
+ if item_type == DocItemLabel.CAPTION and text in embedded_captions:
271
+ continue
272
+
273
+ elif isinstance(item, TableItem) and item.data:
274
+ index = len(tables)
275
+ ref_str = f"#/tables/{index}"
276
+ main_text.append(
277
+ Ref(
278
+ name=reverse_label_mapping[item.label],
279
+ obj_type=layout_label_to_ds_type.get(item.label),
280
+ ref=ref_str,
281
+ ),
184
282
  )
185
- )
186
- elif isinstance(element, TableElement):
187
- index = len(tables)
188
- ref_str = f"#/tables/{index}"
189
- main_text.append(
190
- Ref(
191
- name=element.label,
192
- obj_type=layout_label_to_ds_type.get(element.label),
193
- ref=ref_str,
194
- ),
195
- )
196
283
 
197
- # Initialise empty table data grid (only empty cells)
198
- table_data = [
199
- [
200
- TableCell(
201
- text="",
202
- # bbox=[0,0,0,0],
203
- spans=[[i, j]],
204
- obj_type="body",
205
- )
206
- for j in range(element.num_cols)
284
+ # Initialise empty table data grid (only empty cells)
285
+ table_data = [
286
+ [
287
+ TableCell(
288
+ text="",
289
+ # bbox=[0,0,0,0],
290
+ spans=[[i, j]],
291
+ obj_type="body",
292
+ )
293
+ for j in range(item.data.num_cols)
294
+ ]
295
+ for i in range(item.data.num_rows)
207
296
  ]
208
- for i in range(element.num_rows)
209
- ]
210
-
211
- # Overwrite cells in table data for which there is actual cell content.
212
- for cell in element.table_cells:
213
- for i in range(
214
- min(cell.start_row_offset_idx, element.num_rows),
215
- min(cell.end_row_offset_idx, element.num_rows),
216
- ):
217
- for j in range(
218
- min(cell.start_col_offset_idx, element.num_cols),
219
- min(cell.end_col_offset_idx, element.num_cols),
297
+
298
+ # Overwrite cells in table data for which there is actual cell content.
299
+ for cell in item.data.table_cells:
300
+ for i in range(
301
+ min(cell.start_row_offset_idx, item.data.num_rows),
302
+ min(cell.end_row_offset_idx, item.data.num_rows),
220
303
  ):
221
- celltype = "body"
222
- if cell.column_header:
223
- celltype = "col_header"
224
- elif cell.row_header:
225
- celltype = "row_header"
226
- elif cell.row_section:
227
- celltype = "row_section"
228
-
229
- def make_spans(cell):
230
- for rspan in range(
231
- min(cell.start_row_offset_idx, element.num_rows),
232
- min(cell.end_row_offset_idx, element.num_rows),
233
- ):
234
- for cspan in range(
304
+ for j in range(
305
+ min(cell.start_col_offset_idx, item.data.num_cols),
306
+ min(cell.end_col_offset_idx, item.data.num_cols),
307
+ ):
308
+ celltype = "body"
309
+ if cell.column_header:
310
+ celltype = "col_header"
311
+ elif cell.row_header:
312
+ celltype = "row_header"
313
+ elif cell.row_section:
314
+ celltype = "row_section"
315
+
316
+ def make_spans(cell):
317
+ for rspan in range(
318
+ min(
319
+ cell.start_row_offset_idx,
320
+ item.data.num_rows,
321
+ ),
235
322
  min(
236
- cell.start_col_offset_idx, element.num_cols
323
+ cell.end_row_offset_idx, item.data.num_rows
237
324
  ),
238
- min(cell.end_col_offset_idx, element.num_cols),
239
325
  ):
240
- yield [rspan, cspan]
241
-
242
- spans = list(make_spans(cell))
243
- table_data[i][j] = TableCell(
244
- text=cell.text,
245
- bbox=cell.bbox.to_bottom_left_origin(
246
- page_no_to_page[element.page_no].size.height
247
- ).as_tuple(),
248
- # col=j,
249
- # row=i,
250
- spans=spans,
251
- obj_type=celltype,
252
- # col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
253
- # row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
254
- )
326
+ for cspan in range(
327
+ min(
328
+ cell.start_col_offset_idx,
329
+ item.data.num_cols,
330
+ ),
331
+ min(
332
+ cell.end_col_offset_idx,
333
+ item.data.num_cols,
334
+ ),
335
+ ):
336
+ yield [rspan, cspan]
337
+
338
+ spans = list(make_spans(cell))
339
+ table_data[i][j] = GlmTableCell(
340
+ text=cell.text,
341
+ bbox=(
342
+ cell.bbox.as_tuple()
343
+ if cell.bbox is not None
344
+ else None
345
+ ), # check if this is bottom-left
346
+ spans=spans,
347
+ obj_type=celltype,
348
+ col=j,
349
+ row=i,
350
+ row_header=cell.row_header,
351
+ row_section=cell.row_section,
352
+ col_header=cell.column_header,
353
+ row_span=[
354
+ cell.start_row_offset_idx,
355
+ cell.end_row_offset_idx,
356
+ ],
357
+ col_span=[
358
+ cell.start_col_offset_idx,
359
+ cell.end_col_offset_idx,
360
+ ],
361
+ )
362
+
363
+ # Compute the caption
364
+ caption = item.caption_text(self.document)
365
+
366
+ tables.append(
367
+ DsSchemaTable(
368
+ text=caption,
369
+ num_cols=item.data.num_cols,
370
+ num_rows=item.data.num_rows,
371
+ obj_type=layout_label_to_ds_type.get(item.label),
372
+ data=table_data,
373
+ prov=[
374
+ Prov(
375
+ bbox=p.bbox.as_tuple(),
376
+ page=p.page_no,
377
+ span=[0, 0],
378
+ )
379
+ for p in item.prov
380
+ ],
381
+ )
382
+ )
255
383
 
256
- tables.append(
257
- DsSchemaTable(
258
- num_cols=element.num_cols,
259
- num_rows=element.num_rows,
260
- obj_type=layout_label_to_ds_type.get(element.label),
261
- data=table_data,
262
- prov=[
263
- Prov(
264
- bbox=target_bbox,
265
- page=element.page_no + 1,
266
- span=[0, 0],
267
- )
268
- ],
384
+ elif isinstance(item, PictureItem):
385
+ index = len(figures)
386
+ ref_str = f"#/figures/{index}"
387
+ main_text.append(
388
+ Ref(
389
+ name=reverse_label_mapping[item.label],
390
+ obj_type=layout_label_to_ds_type.get(item.label),
391
+ ref=ref_str,
392
+ ),
269
393
  )
270
- )
271
394
 
272
- elif isinstance(element, FigureElement):
273
- index = len(figures)
274
- ref_str = f"#/figures/{index}"
275
- main_text.append(
276
- Ref(
277
- name=element.label,
278
- obj_type=layout_label_to_ds_type.get(element.label),
279
- ref=ref_str,
280
- ),
281
- )
282
- figures.append(
283
- Figure(
284
- prov=[
285
- Prov(
286
- bbox=target_bbox,
287
- page=element.page_no + 1,
288
- span=[0, 0],
289
- )
290
- ],
291
- obj_type=layout_label_to_ds_type.get(element.label),
292
- # data=[[]],
395
+ # Compute the caption
396
+ caption = item.caption_text(self.document)
397
+
398
+ figures.append(
399
+ Figure(
400
+ prov=[
401
+ Prov(
402
+ bbox=p.bbox.as_tuple(),
403
+ page=p.page_no,
404
+ span=[0, len(caption)],
405
+ )
406
+ for p in item.prov
407
+ ],
408
+ obj_type=layout_label_to_ds_type.get(item.label),
409
+ text=caption,
410
+ # data=[[]],
411
+ )
293
412
  )
294
- )
295
413
 
296
414
  page_dimensions = [
297
- PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
298
- for p in self.pages
415
+ PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
416
+ for p in self.document.pages.values()
299
417
  ]
300
418
 
301
419
  ds_doc = DsDocument(
@@ -303,6 +421,10 @@ class ConvertedDocument(BaseModel):
303
421
  description=desc,
304
422
  file_info=file_info,
305
423
  main_text=main_text,
424
+ equations=equations,
425
+ footnotes=footnotes,
426
+ page_headers=page_headers,
427
+ page_footers=page_footers,
306
428
  tables=tables,
307
429
  figures=figures,
308
430
  page_dimensions=page_dimensions,
@@ -310,152 +432,76 @@ class ConvertedDocument(BaseModel):
310
432
 
311
433
  return ds_doc
312
434
 
313
- def render_as_dict(self):
314
- return self.output.model_dump(by_alias=True, exclude_none=True)
315
-
316
- def render_as_markdown(
317
- self,
318
- delim: str = "\n\n",
319
- main_text_start: int = 0,
320
- main_text_stop: Optional[int] = None,
321
- main_text_labels: list[str] = [
322
- "title",
323
- "subtitle-level-1",
324
- "paragraph",
325
- "caption",
326
- "table",
327
- "figure",
328
- ],
329
- strict_text: bool = False,
330
- image_placeholder: str = "<!-- image -->",
331
- ):
332
- return self.output.export_to_markdown(
333
- delim=delim,
334
- main_text_start=main_text_start,
335
- main_text_stop=main_text_stop,
336
- main_text_labels=main_text_labels,
337
- strict_text=strict_text,
338
- image_placeholder=image_placeholder,
339
- )
340
-
341
- def render_as_text(
342
- self,
343
- delim: str = "\n\n",
344
- main_text_start: int = 0,
345
- main_text_stop: Optional[int] = None,
346
- main_text_labels: list[str] = [
347
- "title",
348
- "subtitle-level-1",
349
- "paragraph",
350
- "caption",
351
- ],
352
- ):
353
- return self.output.export_to_markdown(
354
- delim=delim,
355
- main_text_start=main_text_start,
356
- main_text_stop=main_text_stop,
357
- main_text_labels=main_text_labels,
358
- strict_text=True,
359
- )
360
-
361
- def render_as_doctags(
362
- self,
363
- delim: str = "\n\n",
364
- main_text_start: int = 0,
365
- main_text_stop: Optional[int] = None,
366
- main_text_labels: list[str] = [
367
- "title",
368
- "subtitle-level-1",
369
- "paragraph",
370
- "caption",
371
- "table",
372
- "figure",
373
- ],
374
- xsize: int = 100,
375
- ysize: int = 100,
376
- add_location: bool = True,
377
- add_content: bool = True,
378
- add_page_index: bool = True,
379
- # table specific flags
380
- add_table_cell_location: bool = False,
381
- add_table_cell_label: bool = True,
382
- add_table_cell_text: bool = True,
383
- ) -> str:
384
- return self.output.export_to_document_tokens(
385
- delim=delim,
386
- main_text_start=main_text_start,
387
- main_text_stop=main_text_stop,
388
- main_text_labels=main_text_labels,
389
- xsize=xsize,
390
- ysize=ysize,
391
- add_location=add_location,
392
- add_content=add_content,
393
- add_page_index=add_page_index,
394
- # table specific flags
395
- add_table_cell_location=add_table_cell_location,
396
- add_table_cell_label=add_table_cell_label,
397
- add_table_cell_text=add_table_cell_text,
398
- )
399
-
400
- def render_element_images(
401
- self, element_types: Tuple[PageElement] = (FigureElement,)
402
- ):
403
- for element in self.assembled.elements:
404
- if isinstance(element, element_types):
405
- page_ix = element.page_no
406
- scale = self.pages[page_ix]._default_image_scale
407
- crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
408
- page_height=self.pages[page_ix].size.height * scale
409
- )
410
-
411
- cropped_im = self.pages[page_ix].image.crop(crop_bbox.as_tuple())
412
- yield element, cropped_im
413
-
414
435
 
415
- class ConversionResult(ConvertedDocument):
416
- pass
436
+ class _DocumentConversionInput(BaseModel):
417
437
 
418
-
419
- class DocumentConversionInput(BaseModel):
420
-
421
- _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
438
+ path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
422
439
  limits: Optional[DocumentLimits] = DocumentLimits()
423
440
 
424
- DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
425
-
426
441
  def docs(
427
- self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
442
+ self, format_options: Dict[InputFormat, "FormatOption"]
428
443
  ) -> Iterable[InputDocument]:
444
+ for item in self.path_or_stream_iterator:
445
+ obj = resolve_file_source(item) if isinstance(item, str) else item
446
+ format = self._guess_format(obj)
447
+ if format not in format_options.keys():
448
+ _log.info(
449
+ f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
450
+ )
451
+ continue
452
+ else:
453
+ backend = format_options[format].backend
429
454
 
430
- pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND
431
-
432
- for obj in self._path_or_stream_iterator:
433
455
  if isinstance(obj, Path):
434
456
  yield InputDocument(
435
- path_or_stream=obj, limits=self.limits, pdf_backend=pdf_backend
457
+ path_or_stream=obj,
458
+ format=format,
459
+ filename=obj.name,
460
+ limits=self.limits,
461
+ backend=backend,
436
462
  )
437
463
  elif isinstance(obj, DocumentStream):
438
464
  yield InputDocument(
439
465
  path_or_stream=obj.stream,
440
- filename=obj.filename,
466
+ format=format,
467
+ filename=obj.name,
441
468
  limits=self.limits,
442
- pdf_backend=pdf_backend,
469
+ backend=backend,
443
470
  )
444
-
445
- @classmethod
446
- def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
447
- paths = [Path(p) for p in paths]
448
-
449
- doc_input = cls(limits=limits)
450
- doc_input._path_or_stream_iterator = paths
451
-
452
- return doc_input
453
-
454
- @classmethod
455
- def from_streams(
456
- cls, streams: Iterable[DocumentStream], limits: Optional[DocumentLimits] = None
457
- ):
458
- doc_input = cls(limits=limits)
459
- doc_input._path_or_stream_iterator = streams
460
-
461
- return doc_input
471
+ else:
472
+ raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
473
+
474
+ def _guess_format(self, obj):
475
+ content = None
476
+ if isinstance(obj, Path):
477
+ mime = filetype.guess_mime(str(obj))
478
+ if mime is None:
479
+ with obj.open("rb") as f:
480
+ content = f.read(1024) # Read first 1KB
481
+
482
+ elif isinstance(obj, DocumentStream):
483
+ obj.stream.seek(0)
484
+ content = obj.stream.read(8192)
485
+ obj.stream.seek(0)
486
+ mime = filetype.guess_mime(content)
487
+
488
+ if mime is None:
489
+ mime = self._detect_html_xhtml(content)
490
+
491
+ format = MimeTypeToFormat.get(mime)
492
+ return format
493
+
494
+ def _detect_html_xhtml(self, content):
495
+ content_str = content.decode("ascii", errors="ignore").lower()
496
+ # Remove XML comments
497
+ content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
498
+ content_str = content_str.lstrip()
499
+
500
+ if re.match(r"<\?xml", content_str):
501
+ if "xhtml" in content_str[:1000]:
502
+ return "application/xhtml+xml"
503
+
504
+ if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
505
+ return "text/html"
506
+
507
+ return None