docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. docling/backend/abstract_backend.py +33 -37
  2. docling/backend/asciidoc_backend.py +431 -0
  3. docling/backend/docling_parse_backend.py +20 -16
  4. docling/backend/docling_parse_v2_backend.py +248 -0
  5. docling/backend/html_backend.py +429 -0
  6. docling/backend/md_backend.py +346 -0
  7. docling/backend/mspowerpoint_backend.py +398 -0
  8. docling/backend/msword_backend.py +496 -0
  9. docling/backend/pdf_backend.py +78 -0
  10. docling/backend/pypdfium2_backend.py +16 -11
  11. docling/cli/main.py +96 -65
  12. docling/datamodel/base_models.py +79 -193
  13. docling/datamodel/document.py +405 -320
  14. docling/datamodel/pipeline_options.py +19 -3
  15. docling/datamodel/settings.py +16 -1
  16. docling/document_converter.py +240 -251
  17. docling/models/base_model.py +28 -0
  18. docling/models/base_ocr_model.py +40 -10
  19. docling/models/ds_glm_model.py +244 -30
  20. docling/models/easyocr_model.py +57 -42
  21. docling/models/layout_model.py +158 -116
  22. docling/models/page_assemble_model.py +127 -101
  23. docling/models/page_preprocessing_model.py +79 -0
  24. docling/models/table_structure_model.py +162 -116
  25. docling/models/tesseract_ocr_cli_model.py +76 -59
  26. docling/models/tesseract_ocr_model.py +90 -58
  27. docling/pipeline/base_pipeline.py +189 -0
  28. docling/pipeline/simple_pipeline.py +56 -0
  29. docling/pipeline/standard_pdf_pipeline.py +201 -0
  30. docling/utils/export.py +4 -3
  31. docling/utils/layout_utils.py +17 -11
  32. docling/utils/profiling.py +62 -0
  33. docling-2.4.1.dist-info/METADATA +154 -0
  34. docling-2.4.1.dist-info/RECORD +45 -0
  35. docling/pipeline/base_model_pipeline.py +0 -18
  36. docling/pipeline/standard_model_pipeline.py +0 -66
  37. docling-1.19.1.dist-info/METADATA +0 -380
  38. docling-1.19.1.dist-info/RECORD +0 -34
  39. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
  40. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
  41. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
@@ -1,87 +1,113 @@
1
1
  import logging
2
+ import re
3
+ from enum import Enum
2
4
  from io import BytesIO
3
5
  from pathlib import Path, PurePath
4
- from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
5
-
6
- from docling_core.types import BaseCell, BaseText
7
- from docling_core.types import Document as DsDocument
8
- from docling_core.types import DocumentDescription as DsDocumentDescription
9
- from docling_core.types import FileInfoObject as DsFileInfoObject
10
- from docling_core.types import PageDimensions, PageReference, Prov, Ref
11
- from docling_core.types import Table as DsSchemaTable
12
- from docling_core.types import TableCell
13
- from docling_core.types.doc.base import BoundingBox as DsBoundingBox
14
- from docling_core.types.doc.base import Figure
6
+ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
7
+
8
+ import filetype
9
+ from docling_core.types.doc import (
10
+ DocItem,
11
+ DocItemLabel,
12
+ DoclingDocument,
13
+ PictureItem,
14
+ SectionHeaderItem,
15
+ TableItem,
16
+ TextItem,
17
+ )
18
+ from docling_core.types.doc.document import ListItem
19
+ from docling_core.types.legacy_doc.base import (
20
+ BaseText,
21
+ Figure,
22
+ GlmTableCell,
23
+ PageDimensions,
24
+ PageReference,
25
+ Prov,
26
+ Ref,
27
+ )
28
+ from docling_core.types.legacy_doc.base import Table as DsSchemaTable
29
+ from docling_core.types.legacy_doc.base import TableCell
30
+ from docling_core.types.legacy_doc.document import (
31
+ CCSDocumentDescription as DsDocumentDescription,
32
+ )
33
+ from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
34
+ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
35
+ from docling_core.utils.file import resolve_file_source
15
36
  from pydantic import BaseModel
16
37
  from typing_extensions import deprecated
17
38
 
18
- from docling.backend.abstract_backend import PdfDocumentBackend
19
- from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
39
+ from docling.backend.abstract_backend import (
40
+ AbstractDocumentBackend,
41
+ PaginatedDocumentBackend,
42
+ )
20
43
  from docling.datamodel.base_models import (
21
44
  AssembledUnit,
22
45
  ConversionStatus,
23
46
  DocumentStream,
24
47
  ErrorItem,
25
- FigureElement,
48
+ FormatToExtensions,
49
+ FormatToMimeType,
50
+ InputFormat,
51
+ MimeTypeToFormat,
26
52
  Page,
27
- PageElement,
28
- TableElement,
29
- TextElement,
30
53
  )
31
54
  from docling.datamodel.settings import DocumentLimits
32
- from docling.utils.utils import create_file_hash
55
+ from docling.utils.profiling import ProfilingItem
56
+ from docling.utils.utils import create_file_hash, create_hash
57
+
58
+ if TYPE_CHECKING:
59
+ from docling.document_converter import FormatOption
33
60
 
34
61
  _log = logging.getLogger(__name__)
35
62
 
36
63
  layout_label_to_ds_type = {
37
- "Title": "title",
38
- "Document Index": "table-of-path_or_stream",
39
- "Section-header": "subtitle-level-1",
40
- "Checkbox-Selected": "checkbox-selected",
41
- "Checkbox-Unselected": "checkbox-unselected",
42
- "Caption": "caption",
43
- "Page-header": "page-header",
44
- "Page-footer": "page-footer",
45
- "Footnote": "footnote",
46
- "Table": "table",
47
- "Formula": "equation",
48
- "List-item": "paragraph",
49
- "Code": "paragraph",
50
- "Picture": "figure",
51
- "Text": "paragraph",
64
+ DocItemLabel.TITLE: "title",
65
+ DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
66
+ DocItemLabel.SECTION_HEADER: "subtitle-level-1",
67
+ DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
68
+ DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
69
+ DocItemLabel.CAPTION: "caption",
70
+ DocItemLabel.PAGE_HEADER: "page-header",
71
+ DocItemLabel.PAGE_FOOTER: "page-footer",
72
+ DocItemLabel.FOOTNOTE: "footnote",
73
+ DocItemLabel.TABLE: "table",
74
+ DocItemLabel.FORMULA: "equation",
75
+ DocItemLabel.LIST_ITEM: "paragraph",
76
+ DocItemLabel.CODE: "paragraph",
77
+ DocItemLabel.PICTURE: "figure",
78
+ DocItemLabel.TEXT: "paragraph",
79
+ DocItemLabel.PARAGRAPH: "paragraph",
52
80
  }
53
81
 
54
- _EMPTY_DOC = DsDocument(
55
- _name="",
56
- description=DsDocumentDescription(logs=[]),
57
- file_info=DsFileInfoObject(
58
- filename="",
59
- document_hash="",
60
- ),
61
- )
82
+ _EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
62
83
 
63
84
 
64
85
  class InputDocument(BaseModel):
65
- file: PurePath = None
66
- document_hash: Optional[str] = None
67
- valid: bool = False
86
+ file: PurePath
87
+ document_hash: str # = None
88
+ valid: bool = True
68
89
  limits: DocumentLimits = DocumentLimits()
90
+ format: InputFormat # = None
69
91
 
70
92
  filesize: Optional[int] = None
71
- page_count: Optional[int] = None
93
+ page_count: int = 0
72
94
 
73
- _backend: PdfDocumentBackend = None # Internal PDF backend used
95
+ _backend: AbstractDocumentBackend # Internal PDF backend used
74
96
 
75
97
  def __init__(
76
98
  self,
77
99
  path_or_stream: Union[BytesIO, Path],
100
+ format: InputFormat,
101
+ backend: Type[AbstractDocumentBackend],
78
102
  filename: Optional[str] = None,
79
103
  limits: Optional[DocumentLimits] = None,
80
- pdf_backend=DoclingParseDocumentBackend,
81
104
  ):
82
- super().__init__()
105
+ super().__init__(
106
+ file="", document_hash="", format=InputFormat.PDF
107
+ ) # initialize with dummy values
83
108
 
84
109
  self.limits = limits or DocumentLimits()
110
+ self.format = format
85
111
 
86
112
  try:
87
113
  if isinstance(path_or_stream, Path):
@@ -91,11 +117,12 @@ class InputDocument(BaseModel):
91
117
  self.valid = False
92
118
  else:
93
119
  self.document_hash = create_file_hash(path_or_stream)
94
- self._backend = pdf_backend(
95
- path_or_stream=path_or_stream, document_hash=self.document_hash
96
- )
120
+ self._init_doc(backend, path_or_stream)
97
121
 
98
122
  elif isinstance(path_or_stream, BytesIO):
123
+ assert (
124
+ filename is not None
125
+ ), "Can't construct InputDocument from stream without providing filename arg."
99
126
  self.file = PurePath(filename)
100
127
  self.filesize = path_or_stream.getbuffer().nbytes
101
128
 
@@ -103,31 +130,57 @@ class InputDocument(BaseModel):
103
130
  self.valid = False
104
131
  else:
105
132
  self.document_hash = create_file_hash(path_or_stream)
106
- self._backend = pdf_backend(
107
- path_or_stream=path_or_stream, document_hash=self.document_hash
108
- )
109
-
110
- if self.document_hash and self._backend.page_count() > 0:
111
- self.page_count = self._backend.page_count()
133
+ self._init_doc(backend, path_or_stream)
134
+ else:
135
+ raise RuntimeError(
136
+ f"Unexpected type path_or_stream: {type(path_or_stream)}"
137
+ )
112
138
 
113
- if self.page_count <= self.limits.max_num_pages:
114
- self.valid = True
139
+ # For paginated backends, check if the maximum page count is exceeded.
140
+ if self.valid and self._backend.is_valid():
141
+ if self._backend.supports_pagination() and isinstance(
142
+ self._backend, PaginatedDocumentBackend
143
+ ):
144
+ self.page_count = self._backend.page_count()
145
+ if not self.page_count <= self.limits.max_num_pages:
146
+ self.valid = False
115
147
 
116
148
  except (FileNotFoundError, OSError) as e:
149
+ self.valid = False
117
150
  _log.exception(
118
151
  f"File {self.file.name} not found or cannot be opened.", exc_info=e
119
152
  )
120
153
  # raise
121
154
  except RuntimeError as e:
155
+ self.valid = False
122
156
  _log.exception(
123
157
  f"An unexpected error occurred while opening the document {self.file.name}",
124
158
  exc_info=e,
125
159
  )
126
160
  # raise
127
161
 
162
+ def _init_doc(
163
+ self,
164
+ backend: Type[AbstractDocumentBackend],
165
+ path_or_stream: Union[BytesIO, Path],
166
+ ) -> None:
167
+ if backend is None:
168
+ raise RuntimeError(
169
+ f"No backend configuration provided for file {self.file.name} with format {self.format}. "
170
+ f"Please check your format configuration on DocumentConverter."
171
+ )
172
+
173
+ self._backend = backend(self, path_or_stream=path_or_stream)
174
+ if not self._backend.is_valid():
175
+ self.valid = False
128
176
 
129
- @deprecated("Use `ConversionResult` instead.")
130
- class ConvertedDocument(BaseModel):
177
+
178
+ class DocumentFormat(str, Enum):
179
+ V2 = "v2"
180
+ V1 = "v1"
181
+
182
+
183
+ class ConversionResult(BaseModel):
131
184
  input: InputDocument
132
185
 
133
186
  status: ConversionStatus = ConversionStatus.PENDING # failure, success
@@ -135,16 +188,44 @@ class ConvertedDocument(BaseModel):
135
188
 
136
189
  pages: List[Page] = []
137
190
  assembled: AssembledUnit = AssembledUnit()
191
+ timings: Dict[str, ProfilingItem] = {}
192
+
193
+ document: DoclingDocument = _EMPTY_DOCLING_DOC
194
+
195
+ @property
196
+ @deprecated("Use document instead.")
197
+ def legacy_document(self):
198
+ reverse_label_mapping = {
199
+ DocItemLabel.CAPTION.value: "Caption",
200
+ DocItemLabel.FOOTNOTE.value: "Footnote",
201
+ DocItemLabel.FORMULA.value: "Formula",
202
+ DocItemLabel.LIST_ITEM.value: "List-item",
203
+ DocItemLabel.PAGE_FOOTER.value: "Page-footer",
204
+ DocItemLabel.PAGE_HEADER.value: "Page-header",
205
+ DocItemLabel.PICTURE.value: "Picture", # low threshold adjust to capture chemical structures for examples.
206
+ DocItemLabel.SECTION_HEADER.value: "Section-header",
207
+ DocItemLabel.TABLE.value: "Table",
208
+ DocItemLabel.TEXT.value: "Text",
209
+ DocItemLabel.TITLE.value: "Title",
210
+ DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
211
+ DocItemLabel.CODE.value: "Code",
212
+ DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
213
+ DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
214
+ DocItemLabel.FORM.value: "Form",
215
+ DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
216
+ DocItemLabel.PARAGRAPH.value: "paragraph",
217
+ }
138
218
 
139
- output: DsDocument = _EMPTY_DOC
140
-
141
- def _to_ds_document(self) -> DsDocument:
142
219
  title = ""
143
220
  desc = DsDocumentDescription(logs=[])
144
221
 
145
222
  page_hashes = [
146
- PageReference(hash=p.page_hash, page=p.page_no + 1, model="default")
147
- for p in self.pages
223
+ PageReference(
224
+ hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
225
+ page=p.page_no,
226
+ model="default",
227
+ )
228
+ for p in self.document.pages.values()
148
229
  ]
149
230
 
150
231
  file_info = DsFileInfoObject(
@@ -157,145 +238,199 @@ class ConvertedDocument(BaseModel):
157
238
  main_text = []
158
239
  tables = []
159
240
  figures = []
241
+ equations = []
242
+ footnotes = []
243
+ page_headers = []
244
+ page_footers = []
245
+
246
+ embedded_captions = set()
247
+ for ix, (item, level) in enumerate(
248
+ self.document.iterate_items(self.document.body)
249
+ ):
250
+
251
+ if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
252
+ caption = item.caption_text(self.document)
253
+ if caption:
254
+ embedded_captions.add(caption)
255
+
256
+ for item, level in self.document.iterate_items():
257
+ if isinstance(item, DocItem):
258
+ item_type = item.label
259
+
260
+ if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
261
+
262
+ if isinstance(item, ListItem) and item.marker:
263
+ text = f"{item.marker} {item.text}"
264
+ else:
265
+ text = item.text
266
+
267
+ # Can be empty.
268
+ prov = [
269
+ Prov(
270
+ bbox=p.bbox.as_tuple(),
271
+ page=p.page_no,
272
+ span=[0, len(item.text)],
273
+ )
274
+ for p in item.prov
275
+ ]
276
+ main_text.append(
277
+ BaseText(
278
+ text=text,
279
+ obj_type=layout_label_to_ds_type.get(item.label),
280
+ name=reverse_label_mapping[item.label],
281
+ prov=prov,
282
+ )
283
+ )
160
284
 
161
- page_no_to_page = {p.page_no: p for p in self.pages}
162
-
163
- for element in self.assembled.elements:
164
- # Convert bboxes to lower-left origin.
165
- target_bbox = DsBoundingBox(
166
- element.cluster.bbox.to_bottom_left_origin(
167
- page_no_to_page[element.page_no].size.height
168
- ).as_tuple()
169
- )
170
-
171
- if isinstance(element, TextElement):
172
- main_text.append(
173
- BaseText(
174
- text=element.text,
175
- obj_type=layout_label_to_ds_type.get(element.label),
176
- name=element.label,
177
- prov=[
178
- Prov(
179
- bbox=target_bbox,
180
- page=element.page_no + 1,
181
- span=[0, len(element.text)],
182
- )
183
- ],
285
+ # skip captions of they are embedded in the actual
286
+ # floating object
287
+ if item_type == DocItemLabel.CAPTION and text in embedded_captions:
288
+ continue
289
+
290
+ elif isinstance(item, TableItem) and item.data:
291
+ index = len(tables)
292
+ ref_str = f"#/tables/{index}"
293
+ main_text.append(
294
+ Ref(
295
+ name=reverse_label_mapping[item.label],
296
+ obj_type=layout_label_to_ds_type.get(item.label),
297
+ ref=ref_str,
298
+ ),
184
299
  )
185
- )
186
- elif isinstance(element, TableElement):
187
- index = len(tables)
188
- ref_str = f"#/tables/{index}"
189
- main_text.append(
190
- Ref(
191
- name=element.label,
192
- obj_type=layout_label_to_ds_type.get(element.label),
193
- ref=ref_str,
194
- ),
195
- )
196
300
 
197
- # Initialise empty table data grid (only empty cells)
198
- table_data = [
199
- [
200
- TableCell(
201
- text="",
202
- # bbox=[0,0,0,0],
203
- spans=[[i, j]],
204
- obj_type="body",
205
- )
206
- for j in range(element.num_cols)
301
+ # Initialise empty table data grid (only empty cells)
302
+ table_data = [
303
+ [
304
+ TableCell(
305
+ text="",
306
+ # bbox=[0,0,0,0],
307
+ spans=[[i, j]],
308
+ obj_type="body",
309
+ )
310
+ for j in range(item.data.num_cols)
311
+ ]
312
+ for i in range(item.data.num_rows)
207
313
  ]
208
- for i in range(element.num_rows)
209
- ]
210
-
211
- # Overwrite cells in table data for which there is actual cell content.
212
- for cell in element.table_cells:
213
- for i in range(
214
- min(cell.start_row_offset_idx, element.num_rows),
215
- min(cell.end_row_offset_idx, element.num_rows),
216
- ):
217
- for j in range(
218
- min(cell.start_col_offset_idx, element.num_cols),
219
- min(cell.end_col_offset_idx, element.num_cols),
314
+
315
+ # Overwrite cells in table data for which there is actual cell content.
316
+ for cell in item.data.table_cells:
317
+ for i in range(
318
+ min(cell.start_row_offset_idx, item.data.num_rows),
319
+ min(cell.end_row_offset_idx, item.data.num_rows),
220
320
  ):
221
- celltype = "body"
222
- if cell.column_header:
223
- celltype = "col_header"
224
- elif cell.row_header:
225
- celltype = "row_header"
226
- elif cell.row_section:
227
- celltype = "row_section"
228
-
229
- def make_spans(cell):
230
- for rspan in range(
231
- min(cell.start_row_offset_idx, element.num_rows),
232
- min(cell.end_row_offset_idx, element.num_rows),
233
- ):
234
- for cspan in range(
321
+ for j in range(
322
+ min(cell.start_col_offset_idx, item.data.num_cols),
323
+ min(cell.end_col_offset_idx, item.data.num_cols),
324
+ ):
325
+ celltype = "body"
326
+ if cell.column_header:
327
+ celltype = "col_header"
328
+ elif cell.row_header:
329
+ celltype = "row_header"
330
+ elif cell.row_section:
331
+ celltype = "row_section"
332
+
333
+ def make_spans(cell):
334
+ for rspan in range(
335
+ min(
336
+ cell.start_row_offset_idx,
337
+ item.data.num_rows,
338
+ ),
235
339
  min(
236
- cell.start_col_offset_idx, element.num_cols
340
+ cell.end_row_offset_idx, item.data.num_rows
237
341
  ),
238
- min(cell.end_col_offset_idx, element.num_cols),
239
342
  ):
240
- yield [rspan, cspan]
241
-
242
- spans = list(make_spans(cell))
243
- table_data[i][j] = TableCell(
244
- text=cell.text,
245
- bbox=cell.bbox.to_bottom_left_origin(
246
- page_no_to_page[element.page_no].size.height
247
- ).as_tuple(),
248
- # col=j,
249
- # row=i,
250
- spans=spans,
251
- obj_type=celltype,
252
- # col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
253
- # row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
254
- )
343
+ for cspan in range(
344
+ min(
345
+ cell.start_col_offset_idx,
346
+ item.data.num_cols,
347
+ ),
348
+ min(
349
+ cell.end_col_offset_idx,
350
+ item.data.num_cols,
351
+ ),
352
+ ):
353
+ yield [rspan, cspan]
354
+
355
+ spans = list(make_spans(cell))
356
+ table_data[i][j] = GlmTableCell(
357
+ text=cell.text,
358
+ bbox=(
359
+ cell.bbox.as_tuple()
360
+ if cell.bbox is not None
361
+ else None
362
+ ), # check if this is bottom-left
363
+ spans=spans,
364
+ obj_type=celltype,
365
+ col=j,
366
+ row=i,
367
+ row_header=cell.row_header,
368
+ row_section=cell.row_section,
369
+ col_header=cell.column_header,
370
+ row_span=[
371
+ cell.start_row_offset_idx,
372
+ cell.end_row_offset_idx,
373
+ ],
374
+ col_span=[
375
+ cell.start_col_offset_idx,
376
+ cell.end_col_offset_idx,
377
+ ],
378
+ )
379
+
380
+ # Compute the caption
381
+ caption = item.caption_text(self.document)
382
+
383
+ tables.append(
384
+ DsSchemaTable(
385
+ text=caption,
386
+ num_cols=item.data.num_cols,
387
+ num_rows=item.data.num_rows,
388
+ obj_type=layout_label_to_ds_type.get(item.label),
389
+ data=table_data,
390
+ prov=[
391
+ Prov(
392
+ bbox=p.bbox.as_tuple(),
393
+ page=p.page_no,
394
+ span=[0, 0],
395
+ )
396
+ for p in item.prov
397
+ ],
398
+ )
399
+ )
255
400
 
256
- tables.append(
257
- DsSchemaTable(
258
- num_cols=element.num_cols,
259
- num_rows=element.num_rows,
260
- obj_type=layout_label_to_ds_type.get(element.label),
261
- data=table_data,
262
- prov=[
263
- Prov(
264
- bbox=target_bbox,
265
- page=element.page_no + 1,
266
- span=[0, 0],
267
- )
268
- ],
401
+ elif isinstance(item, PictureItem):
402
+ index = len(figures)
403
+ ref_str = f"#/figures/{index}"
404
+ main_text.append(
405
+ Ref(
406
+ name=reverse_label_mapping[item.label],
407
+ obj_type=layout_label_to_ds_type.get(item.label),
408
+ ref=ref_str,
409
+ ),
269
410
  )
270
- )
271
411
 
272
- elif isinstance(element, FigureElement):
273
- index = len(figures)
274
- ref_str = f"#/figures/{index}"
275
- main_text.append(
276
- Ref(
277
- name=element.label,
278
- obj_type=layout_label_to_ds_type.get(element.label),
279
- ref=ref_str,
280
- ),
281
- )
282
- figures.append(
283
- Figure(
284
- prov=[
285
- Prov(
286
- bbox=target_bbox,
287
- page=element.page_no + 1,
288
- span=[0, 0],
289
- )
290
- ],
291
- obj_type=layout_label_to_ds_type.get(element.label),
292
- # data=[[]],
412
+ # Compute the caption
413
+ caption = item.caption_text(self.document)
414
+
415
+ figures.append(
416
+ Figure(
417
+ prov=[
418
+ Prov(
419
+ bbox=p.bbox.as_tuple(),
420
+ page=p.page_no,
421
+ span=[0, len(caption)],
422
+ )
423
+ for p in item.prov
424
+ ],
425
+ obj_type=layout_label_to_ds_type.get(item.label),
426
+ text=caption,
427
+ # data=[[]],
428
+ )
293
429
  )
294
- )
295
430
 
296
431
  page_dimensions = [
297
- PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
298
- for p in self.pages
432
+ PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
433
+ for p in self.document.pages.values()
299
434
  ]
300
435
 
301
436
  ds_doc = DsDocument(
@@ -303,6 +438,10 @@ class ConvertedDocument(BaseModel):
303
438
  description=desc,
304
439
  file_info=file_info,
305
440
  main_text=main_text,
441
+ equations=equations,
442
+ footnotes=footnotes,
443
+ page_headers=page_headers,
444
+ page_footers=page_footers,
306
445
  tables=tables,
307
446
  figures=figures,
308
447
  page_dimensions=page_dimensions,
@@ -310,152 +449,98 @@ class ConvertedDocument(BaseModel):
310
449
 
311
450
  return ds_doc
312
451
 
313
- def render_as_dict(self):
314
- return self.output.model_dump(by_alias=True, exclude_none=True)
315
-
316
- def render_as_markdown(
317
- self,
318
- delim: str = "\n\n",
319
- main_text_start: int = 0,
320
- main_text_stop: Optional[int] = None,
321
- main_text_labels: list[str] = [
322
- "title",
323
- "subtitle-level-1",
324
- "paragraph",
325
- "caption",
326
- "table",
327
- "figure",
328
- ],
329
- strict_text: bool = False,
330
- image_placeholder: str = "<!-- image -->",
331
- ):
332
- return self.output.export_to_markdown(
333
- delim=delim,
334
- main_text_start=main_text_start,
335
- main_text_stop=main_text_stop,
336
- main_text_labels=main_text_labels,
337
- strict_text=strict_text,
338
- image_placeholder=image_placeholder,
339
- )
340
452
 
341
- def render_as_text(
342
- self,
343
- delim: str = "\n\n",
344
- main_text_start: int = 0,
345
- main_text_stop: Optional[int] = None,
346
- main_text_labels: list[str] = [
347
- "title",
348
- "subtitle-level-1",
349
- "paragraph",
350
- "caption",
351
- ],
352
- ):
353
- return self.output.export_to_markdown(
354
- delim=delim,
355
- main_text_start=main_text_start,
356
- main_text_stop=main_text_stop,
357
- main_text_labels=main_text_labels,
358
- strict_text=True,
359
- )
360
-
361
- def render_as_doctags(
362
- self,
363
- delim: str = "\n\n",
364
- main_text_start: int = 0,
365
- main_text_stop: Optional[int] = None,
366
- main_text_labels: list[str] = [
367
- "title",
368
- "subtitle-level-1",
369
- "paragraph",
370
- "caption",
371
- "table",
372
- "figure",
373
- ],
374
- xsize: int = 100,
375
- ysize: int = 100,
376
- add_location: bool = True,
377
- add_content: bool = True,
378
- add_page_index: bool = True,
379
- # table specific flags
380
- add_table_cell_location: bool = False,
381
- add_table_cell_label: bool = True,
382
- add_table_cell_text: bool = True,
383
- ) -> str:
384
- return self.output.export_to_document_tokens(
385
- delim=delim,
386
- main_text_start=main_text_start,
387
- main_text_stop=main_text_stop,
388
- main_text_labels=main_text_labels,
389
- xsize=xsize,
390
- ysize=ysize,
391
- add_location=add_location,
392
- add_content=add_content,
393
- add_page_index=add_page_index,
394
- # table specific flags
395
- add_table_cell_location=add_table_cell_location,
396
- add_table_cell_label=add_table_cell_label,
397
- add_table_cell_text=add_table_cell_text,
398
- )
453
+ class _DocumentConversionInput(BaseModel):
399
454
 
400
- def render_element_images(
401
- self, element_types: Tuple[PageElement] = (FigureElement,)
402
- ):
403
- for element in self.assembled.elements:
404
- if isinstance(element, element_types):
405
- page_ix = element.page_no
406
- scale = self.pages[page_ix]._default_image_scale
407
- crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
408
- page_height=self.pages[page_ix].size.height * scale
409
- )
410
-
411
- cropped_im = self.pages[page_ix].image.crop(crop_bbox.as_tuple())
412
- yield element, cropped_im
413
-
414
-
415
- class ConversionResult(ConvertedDocument):
416
- pass
417
-
418
-
419
- class DocumentConversionInput(BaseModel):
420
-
421
- _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
455
+ path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
422
456
  limits: Optional[DocumentLimits] = DocumentLimits()
423
457
 
424
- DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
425
-
426
458
  def docs(
427
- self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
459
+ self, format_options: Dict[InputFormat, "FormatOption"]
428
460
  ) -> Iterable[InputDocument]:
461
+ for item in self.path_or_stream_iterator:
462
+ obj = resolve_file_source(item) if isinstance(item, str) else item
463
+ format = self._guess_format(obj)
464
+ if format not in format_options.keys():
465
+ _log.info(
466
+ f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
467
+ )
468
+ continue
469
+ else:
470
+ backend = format_options[format].backend
429
471
 
430
- pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND
431
-
432
- for obj in self._path_or_stream_iterator:
433
472
  if isinstance(obj, Path):
434
473
  yield InputDocument(
435
- path_or_stream=obj, limits=self.limits, pdf_backend=pdf_backend
474
+ path_or_stream=obj,
475
+ format=format,
476
+ filename=obj.name,
477
+ limits=self.limits,
478
+ backend=backend,
436
479
  )
437
480
  elif isinstance(obj, DocumentStream):
438
481
  yield InputDocument(
439
482
  path_or_stream=obj.stream,
440
- filename=obj.filename,
483
+ format=format,
484
+ filename=obj.name,
441
485
  limits=self.limits,
442
- pdf_backend=pdf_backend,
486
+ backend=backend,
487
+ )
488
+ else:
489
+ raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
490
+
491
+ def _guess_format(self, obj: Union[Path, DocumentStream]):
492
+ content = b"" # empty binary blob
493
+ format = None
494
+
495
+ if isinstance(obj, Path):
496
+ mime = filetype.guess_mime(str(obj))
497
+ if mime is None:
498
+ ext = obj.suffix[1:]
499
+ mime = self._mime_from_extension(ext)
500
+ if mime is None: # must guess from
501
+ with obj.open("rb") as f:
502
+ content = f.read(1024) # Read first 1KB
503
+
504
+ elif isinstance(obj, DocumentStream):
505
+ content = obj.stream.read(8192)
506
+ obj.stream.seek(0)
507
+ mime = filetype.guess_mime(content)
508
+ if mime is None:
509
+ ext = (
510
+ obj.name.rsplit(".", 1)[-1]
511
+ if ("." in obj.name and not obj.name.startswith("."))
512
+ else ""
443
513
  )
514
+ mime = self._mime_from_extension(ext)
444
515
 
445
- @classmethod
446
- def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
447
- paths = [Path(p) for p in paths]
516
+ mime = mime or self._detect_html_xhtml(content)
517
+ mime = mime or "text/plain"
448
518
 
449
- doc_input = cls(limits=limits)
450
- doc_input._path_or_stream_iterator = paths
519
+ format = MimeTypeToFormat.get(mime)
520
+ return format
451
521
 
452
- return doc_input
522
+ def _mime_from_extension(self, ext):
523
+ mime = None
524
+ if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
525
+ mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
526
+ elif ext in FormatToExtensions[InputFormat.HTML]:
527
+ mime = FormatToMimeType[InputFormat.HTML][0]
528
+ elif ext in FormatToExtensions[InputFormat.MD]:
529
+ mime = FormatToMimeType[InputFormat.MD][0]
453
530
 
454
- @classmethod
455
- def from_streams(
456
- cls, streams: Iterable[DocumentStream], limits: Optional[DocumentLimits] = None
457
- ):
458
- doc_input = cls(limits=limits)
459
- doc_input._path_or_stream_iterator = streams
531
+ return mime
532
+
533
+ def _detect_html_xhtml(self, content):
534
+ content_str = content.decode("ascii", errors="ignore").lower()
535
+ # Remove XML comments
536
+ content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
537
+ content_str = content_str.lstrip()
538
+
539
+ if re.match(r"<\?xml", content_str):
540
+ if "xhtml" in content_str[:1000]:
541
+ return "application/xhtml+xml"
542
+
543
+ if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
544
+ return "text/html"
460
545
 
461
- return doc_input
546
+ return None