docling 2.8.2__tar.gz → 2.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. {docling-2.8.2 → docling-2.9.0}/PKG-INFO +5 -5
  2. {docling-2.8.2 → docling-2.9.0}/README.md +2 -2
  3. {docling-2.8.2 → docling-2.9.0}/docling/backend/msword_backend.py +43 -27
  4. docling-2.9.0/docling/chunking/__init__.py +12 -0
  5. {docling-2.8.2 → docling-2.9.0}/docling/cli/main.py +76 -23
  6. {docling-2.8.2 → docling-2.9.0}/docling/datamodel/base_models.py +3 -0
  7. {docling-2.8.2 → docling-2.9.0}/docling/datamodel/document.py +24 -10
  8. {docling-2.8.2 → docling-2.9.0}/docling/document_converter.py +103 -83
  9. docling-2.9.0/docling/exceptions.py +6 -0
  10. docling-2.9.0/docling/py.typed +1 -0
  11. {docling-2.8.2 → docling-2.9.0}/pyproject.toml +3 -3
  12. {docling-2.8.2 → docling-2.9.0}/LICENSE +0 -0
  13. {docling-2.8.2 → docling-2.9.0}/docling/__init__.py +0 -0
  14. {docling-2.8.2 → docling-2.9.0}/docling/backend/__init__.py +0 -0
  15. {docling-2.8.2 → docling-2.9.0}/docling/backend/abstract_backend.py +0 -0
  16. {docling-2.8.2 → docling-2.9.0}/docling/backend/asciidoc_backend.py +0 -0
  17. {docling-2.8.2 → docling-2.9.0}/docling/backend/docling_parse_backend.py +0 -0
  18. {docling-2.8.2 → docling-2.9.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  19. {docling-2.8.2 → docling-2.9.0}/docling/backend/html_backend.py +0 -0
  20. {docling-2.8.2 → docling-2.9.0}/docling/backend/md_backend.py +0 -0
  21. {docling-2.8.2 → docling-2.9.0}/docling/backend/msexcel_backend.py +0 -0
  22. {docling-2.8.2 → docling-2.9.0}/docling/backend/mspowerpoint_backend.py +0 -0
  23. {docling-2.8.2 → docling-2.9.0}/docling/backend/pdf_backend.py +0 -0
  24. {docling-2.8.2 → docling-2.9.0}/docling/backend/pypdfium2_backend.py +0 -0
  25. {docling-2.8.2 → docling-2.9.0}/docling/cli/__init__.py +0 -0
  26. {docling-2.8.2 → docling-2.9.0}/docling/datamodel/__init__.py +0 -0
  27. {docling-2.8.2 → docling-2.9.0}/docling/datamodel/pipeline_options.py +0 -0
  28. {docling-2.8.2 → docling-2.9.0}/docling/datamodel/settings.py +0 -0
  29. {docling-2.8.2 → docling-2.9.0}/docling/models/__init__.py +0 -0
  30. {docling-2.8.2 → docling-2.9.0}/docling/models/base_model.py +0 -0
  31. {docling-2.8.2 → docling-2.9.0}/docling/models/base_ocr_model.py +0 -0
  32. {docling-2.8.2 → docling-2.9.0}/docling/models/ds_glm_model.py +0 -0
  33. {docling-2.8.2 → docling-2.9.0}/docling/models/easyocr_model.py +0 -0
  34. {docling-2.8.2 → docling-2.9.0}/docling/models/layout_model.py +0 -0
  35. {docling-2.8.2 → docling-2.9.0}/docling/models/ocr_mac_model.py +0 -0
  36. {docling-2.8.2 → docling-2.9.0}/docling/models/page_assemble_model.py +0 -0
  37. {docling-2.8.2 → docling-2.9.0}/docling/models/page_preprocessing_model.py +0 -0
  38. {docling-2.8.2 → docling-2.9.0}/docling/models/rapid_ocr_model.py +0 -0
  39. {docling-2.8.2 → docling-2.9.0}/docling/models/table_structure_model.py +0 -0
  40. {docling-2.8.2 → docling-2.9.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  41. {docling-2.8.2 → docling-2.9.0}/docling/models/tesseract_ocr_model.py +0 -0
  42. {docling-2.8.2 → docling-2.9.0}/docling/pipeline/__init__.py +0 -0
  43. {docling-2.8.2 → docling-2.9.0}/docling/pipeline/base_pipeline.py +0 -0
  44. {docling-2.8.2 → docling-2.9.0}/docling/pipeline/simple_pipeline.py +0 -0
  45. {docling-2.8.2 → docling-2.9.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  46. {docling-2.8.2 → docling-2.9.0}/docling/utils/__init__.py +0 -0
  47. {docling-2.8.2 → docling-2.9.0}/docling/utils/export.py +0 -0
  48. {docling-2.8.2 → docling-2.9.0}/docling/utils/layout_utils.py +0 -0
  49. {docling-2.8.2 → docling-2.9.0}/docling/utils/profiling.py +0 -0
  50. {docling-2.8.2 → docling-2.9.0}/docling/utils/utils.py +0 -0
{docling-2.8.2 → docling-2.9.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: docling
- Version: 2.8.2
+ Version: 2.9.0
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
  Home-page: https://github.com/DS4SD/docling
  License: MIT
@@ -26,7 +26,7 @@ Provides-Extra: tesserocr
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
  Requires-Dist: certifi (>=2024.7.4)
  Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
- Requires-Dist: docling-core (>=2.6.1,<3.0.0)
+ Requires-Dist: docling-core[chunking] (>=2.8.0,<3.0.0)
  Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
  Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -39,7 +39,7 @@ Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (ex
  Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
  Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
- Requires-Dist: pydantic (>=2.0.0,<2.10)
+ Requires-Dist: pydantic (>=2.0.0,<3.0.0)
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
  Requires-Dist: python-docx (>=1.1.2,<2.0.0)
@@ -59,7 +59,7 @@ Description-Content-Type: text/markdown
  </a>
  </p>

- # 🦆 Docling
+ # Docling

  <p align="center">
  <a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
@@ -81,7 +81,7 @@ Docling parses documents and exports them to the desired format with ease and sp

  ## Features

- * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
+ * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
  * 📑 Advanced PDF document understanding including page layout, reading order & table structures
  * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
  * 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
{docling-2.8.2 → docling-2.9.0}/README.md
@@ -4,7 +4,7 @@
  </a>
  </p>

- # 🦆 Docling
+ # Docling

  <p align="center">
  <a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
@@ -26,7 +26,7 @@ Docling parses documents and exports them to the desired format with ease and sp

  ## Features

- * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
+ * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
  * 📑 Advanced PDF document understanding including page layout, reading order & table structures
  * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
  * 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
{docling-2.8.2 → docling-2.9.0}/docling/backend/msword_backend.py
@@ -1,4 +1,5 @@
  import logging
+ import re
  from io import BytesIO
  from pathlib import Path
  from typing import Set, Union
@@ -133,7 +134,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
      def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
          for element in body:
              tag_name = etree.QName(element).localname
-
              # Check for Inline Images (blip elements)
              namespaces = {
                  "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
@@ -153,6 +153,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                  self.handle_pictures(element, docx_obj, drawing_blip, doc)
              # Check for Text
              elif tag_name in ["p"]:
+                 # "tcPr", "sectPr"
                  self.handle_text_elements(element, docx_obj, doc)
              else:
                  _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
@@ -166,6 +167,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
          except ValueError:
              return default

+     def split_text_and_number(self, input_string):
+         match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
+         if match:
+             parts = list(filter(None, match.groups()))
+             return parts
+         else:
+             return [input_string]
+
      def get_numId_and_ilvl(self, paragraph):
          # Access the XML element of the paragraph
          numPr = paragraph._element.find(
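Aside: the switch from style names to style ids further below is what motivates this helper. Built-in Word style ids such as "Heading1" carry their level without a space, so the old split on " " no longer applies. A standalone sketch of the same regex with illustrative inputs:

    import re

    def split_text_and_number(input_string):
        # Same pattern as in the backend: peel a leading or trailing number off a style id.
        match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
        if match:
            return list(filter(None, match.groups()))
        return [input_string]

    print(split_text_and_number("Heading1"))  # ['Heading', '1']
    print(split_text_and_number("2Heading"))  # ['2', 'Heading']
    print(split_text_and_number("Title"))     # ['Title']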
@@ -188,7 +197,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
      def get_label_and_level(self, paragraph):
          if paragraph.style is None:
              return "Normal", None
-         label = paragraph.style.name
+         label = paragraph.style.style_id
          if label is None:
              return "Normal", None
          if ":" in label:
@@ -197,7 +206,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
              if len(parts) == 2:
                  return parts[0], int(parts[1])

-         parts = label.split(" ")
+         parts = self.split_text_and_number(label)

          if "Heading" in label and len(parts) == 2:
              parts.sort()
@@ -219,14 +228,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
          if paragraph.text is None:
              return
          text = paragraph.text.strip()
-         # if len(text)==0 # keep empty paragraphs, they seperate adjacent lists!

          # Common styles for bullet and numbered lists.
          # "List Bullet", "List Number", "List Paragraph"
          # Identify wether list is a numbered list or not
          # is_numbered = "List Bullet" not in paragraph.style.name
          is_numbered = False
-         p_style_name, p_level = self.get_label_and_level(paragraph)
+         p_style_id, p_level = self.get_label_and_level(paragraph)
          numid, ilevel = self.get_numId_and_ilvl(paragraph)

          if numid == 0:
@@ -238,14 +246,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                  element,
                  docx_obj,
                  doc,
-                 p_style_name,
+                 p_style_id,
                  p_level,
                  numid,
                  ilevel,
                  text,
                  is_numbered,
              )
-             self.update_history(p_style_name, p_level, numid, ilevel)
+             self.update_history(p_style_id, p_level, numid, ilevel)
              return
          elif numid is None and self.prev_numid() is not None:  # Close list
              for key, val in self.parents.items():
@@ -253,23 +261,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                      self.parents[key] = None
              self.level = self.level_at_new_list - 1
              self.level_at_new_list = None
-         if p_style_name in ["Title"]:
+         if p_style_id in ["Title"]:
              for key, val in self.parents.items():
                  self.parents[key] = None
              self.parents[0] = doc.add_text(
                  parent=None, label=DocItemLabel.TITLE, text=text
              )
-         elif "Heading" in p_style_name:
-             self.add_header(element, docx_obj, doc, p_style_name, p_level, text)
+         elif "Heading" in p_style_id:
+             self.add_header(element, docx_obj, doc, p_style_id, p_level, text)

-         elif p_style_name in [
+         elif p_style_id in [
              "Paragraph",
              "Normal",
              "Subtitle",
              "Author",
-             "Default Text",
-             "List Paragraph",
-             "List Bullet",
+             "DefaultText",
+             "ListParagraph",
+             "ListBullet",
              "Quote",
          ]:
              level = self.get_level()
@@ -285,15 +293,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                  label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
              )

-         self.update_history(p_style_name, p_level, numid, ilevel)
+         self.update_history(p_style_id, p_level, numid, ilevel)
          return

      def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
          level = self.get_level()
          if isinstance(curr_level, int):
-
              if curr_level > level:
-
                  # add invisible group
                  for i in range(level, curr_level):
                      self.parents[i] = doc.add_group(
@@ -301,9 +307,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                          label=GroupLabel.SECTION,
                          name=f"header-{i}",
                      )
-
              elif curr_level < level:
-
                  # remove the tail
                  for key, val in self.parents.items():
                      if key >= curr_level:
@@ -314,7 +318,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                  text=text,
                  level=curr_level,
              )
-
          else:
              self.parents[self.level] = doc.add_heading(
                  parent=self.parents[self.level - 1],
@@ -328,7 +331,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
          element,
          docx_obj,
          doc,
-         p_style_name,
+         p_style_id,
          p_level,
          numid,
          ilevel,
@@ -346,7 +349,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                  label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
              )

-             # TODO: Set marker and enumerated arguments if this is an enumeration element.
+             # Set marker and enumerated arguments if this is an enumeration element.
              self.listIter += 1
              if is_numbered:
                  enum_marker = str(self.listIter) + "."
@@ -365,8 +368,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                  self.level_at_new_list + self.prev_indent() + 1,
                  self.level_at_new_list + ilevel + 1,
              ):
-                 # TODO: determine if this is an unordered list or an ordered list.
-                 # Set GroupLabel.ORDERED_LIST when it fits.
+                 # Determine if this is an unordered list or an ordered list.
+                 # Set GroupLabel.ORDERED_LIST when it fits.
                  self.listIter = 0
                  if is_numbered:
                      self.parents[i] = doc.add_group(
@@ -467,6 +470,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                  row_span = get_rowspan(cell)
                  col_span = get_colspan(cell)

+                 cell_text = cell.text
+                 # In case cell doesn't return text via docx library:
+                 if len(cell_text) == 0:
+                     cell_xml = cell._element
+
+                     texts = [""]
+                     for elem in cell_xml.iter():
+                         if elem.tag.endswith("t"):  # <w:t> tags that contain text
+                             if elem.text:
+                                 texts.append(elem.text)
+                     # Join the collected text
+                     cell_text = " ".join(texts).strip()
+
                  # Find the next available column in the grid
                  while table_grid[row_idx][col_idx] is not None:
                      col_idx += 1
@@ -477,15 +493,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                          table_grid[row_idx + i][col_idx + j] = ""

                  cell = TableCell(
-                     text=cell.text,
+                     text=cell_text,
                      row_span=row_span,
                      col_span=col_span,
                      start_row_offset_idx=row_idx,
                      end_row_offset_idx=row_idx + row_span,
                      start_col_offset_idx=col_idx,
                      end_col_offset_idx=col_idx + col_span,
-                     col_header=False,  # col_header,
-                     row_header=False,  # ((not col_header) and html_cell.name=='th')
+                     col_header=False,
+                     row_header=False,
                  )

                  data.table_cells.append(cell)
docling-2.9.0/docling/chunking/__init__.py (new file)
@@ -0,0 +1,12 @@
+ #
+ # Copyright IBM Corp. 2024 - 2024
+ # SPDX-License-Identifier: MIT
+ #
+
+ from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
+ from docling_core.transforms.chunker.hierarchical_chunker import (
+     DocChunk,
+     DocMeta,
+     HierarchicalChunker,
+ )
+ from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
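This new module only re-exports the chunker classes that ship with docling-core's chunking extra (now pulled in as a dependency), so downstream code can import them from docling itself. A minimal usage sketch, assuming a document obtained from a prior conversion (the input file name is illustrative):

    from docling.chunking import HybridChunker
    from docling.document_converter import DocumentConverter

    doc = DocumentConverter().convert("report.pdf").document  # illustrative input
    chunker = HybridChunker()
    for chunk in chunker.chunk(dl_doc=doc):
        print(chunk.text[:80])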
{docling-2.8.2 → docling-2.9.0}/docling/cli/main.py
@@ -10,7 +10,9 @@ from pathlib import Path
  from typing import Annotated, Dict, Iterable, List, Optional, Type

  import typer
+ from docling_core.types.doc import ImageRefMode
  from docling_core.utils.file import resolve_source_to_path
+ from pydantic import TypeAdapter, ValidationError

  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
  from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@@ -86,9 +88,11 @@ def export_documents(
      conv_results: Iterable[ConversionResult],
      output_dir: Path,
      export_json: bool,
+     export_html: bool,
      export_md: bool,
      export_txt: bool,
      export_doctags: bool,
+     image_export_mode: ImageRefMode,
  ):

      success_count = 0
@@ -99,33 +103,45 @@ def export_documents(
              success_count += 1
              doc_filename = conv_res.input.file.stem

-             # Export Deep Search document JSON format:
+             # Export JSON format:
              if export_json:
                  fname = output_dir / f"{doc_filename}.json"
-                 with fname.open("w", encoding="utf8") as fp:
-                     _log.info(f"writing JSON output to {fname}")
-                     fp.write(json.dumps(conv_res.document.export_to_dict()))
+                 _log.info(f"writing JSON output to {fname}")
+                 conv_res.document.save_as_json(
+                     filename=fname, image_mode=image_export_mode
+                 )
+
+             # Export HTML format:
+             if export_html:
+                 fname = output_dir / f"{doc_filename}.html"
+                 _log.info(f"writing HTML output to {fname}")
+                 conv_res.document.save_as_html(
+                     filename=fname, image_mode=image_export_mode
+                 )

              # Export Text format:
              if export_txt:
                  fname = output_dir / f"{doc_filename}.txt"
-                 with fname.open("w", encoding="utf8") as fp:
-                     _log.info(f"writing Text output to {fname}")
-                     fp.write(conv_res.document.export_to_markdown(strict_text=True))
+                 _log.info(f"writing TXT output to {fname}")
+                 conv_res.document.save_as_markdown(
+                     filename=fname,
+                     strict_text=True,
+                     image_mode=ImageRefMode.PLACEHOLDER,
+                 )

              # Export Markdown format:
              if export_md:
                  fname = output_dir / f"{doc_filename}.md"
-                 with fname.open("w", encoding="utf8") as fp:
-                     _log.info(f"writing Markdown output to {fname}")
-                     fp.write(conv_res.document.export_to_markdown())
+                 _log.info(f"writing Markdown output to {fname}")
+                 conv_res.document.save_as_markdown(
+                     filename=fname, image_mode=image_export_mode
+                 )

              # Export Document Tags format:
              if export_doctags:
                  fname = output_dir / f"{doc_filename}.doctags"
-                 with fname.open("w", encoding="utf8") as fp:
-                     _log.info(f"writing Doc Tags output to {fname}")
-                     fp.write(conv_res.document.export_to_document_tokens())
+                 _log.info(f"writing Doc Tags output to {fname}")
+                 conv_res.document.save_as_document_tokens(filename=fname)

          else:
              _log.warning(f"Document {conv_res.input.file} failed to convert.")
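For reference, a sketch of the equivalent programmatic calls the CLI now delegates to, assuming a conversion result from DocumentConverter (file names are illustrative; embedded or referenced images additionally require the picture-generation options enabled further down):

    from pathlib import Path

    from docling_core.types.doc import ImageRefMode
    from docling.document_converter import DocumentConverter

    conv_res = DocumentConverter().convert("sample.pdf")  # illustrative input
    conv_res.document.save_as_html(Path("sample.html"), image_mode=ImageRefMode.EMBEDDED)
    conv_res.document.save_as_markdown(Path("sample.md"), image_mode=ImageRefMode.REFERENCED)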
@@ -160,6 +176,13 @@ def convert(
      to_formats: List[OutputFormat] = typer.Option(
          None, "--to", help="Specify output formats. Defaults to Markdown."
      ),
+     image_export_mode: Annotated[
+         ImageRefMode,
+         typer.Option(
+             ...,
+             help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
+         ),
+     ] = ImageRefMode.EMBEDDED,
      ocr: Annotated[
          bool,
          typer.Option(
@@ -260,24 +283,45 @@ def convert(
      with tempfile.TemporaryDirectory() as tempdir:
          input_doc_paths: List[Path] = []
          for src in input_sources:
-             source = resolve_source_to_path(source=src, workdir=Path(tempdir))
-             if not source.exists():
+             try:
+                 # check if we can fetch some remote url
+                 source = resolve_source_to_path(source=src, workdir=Path(tempdir))
+                 input_doc_paths.append(source)
+             except FileNotFoundError:
                  err_console.print(
-                     f"[red]Error: The input file {source} does not exist.[/red]"
+                     f"[red]Error: The input file {src} does not exist.[/red]"
                  )
                  raise typer.Abort()
-             elif source.is_dir():
-                 for fmt in from_formats:
-                     for ext in FormatToExtensions[fmt]:
-                         input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
-                         input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
-             else:
-                 input_doc_paths.append(source)
+             except IsADirectoryError:
+                 # if the input matches to a file or a folder
+                 try:
+                     local_path = TypeAdapter(Path).validate_python(src)
+                     if local_path.exists() and local_path.is_dir():
+                         for fmt in from_formats:
+                             for ext in FormatToExtensions[fmt]:
+                                 input_doc_paths.extend(
+                                     list(local_path.glob(f"**/*.{ext}"))
+                                 )
+                                 input_doc_paths.extend(
+                                     list(local_path.glob(f"**/*.{ext.upper()}"))
+                                 )
+                     elif local_path.exists():
+                         input_doc_paths.append(local_path)
+                     else:
+                         err_console.print(
+                             f"[red]Error: The input file {src} does not exist.[/red]"
+                         )
+                         raise typer.Abort()
+                 except Exception as err:
+                     err_console.print(f"[red]Error: Cannot read the input {src}.[/red]")
+                     _log.info(err)  # will print more details if verbose is activated
+                     raise typer.Abort()

          if to_formats is None:
              to_formats = [OutputFormat.MARKDOWN]

          export_json = OutputFormat.JSON in to_formats
+         export_html = OutputFormat.HTML in to_formats
          export_md = OutputFormat.MARKDOWN in to_formats
          export_txt = OutputFormat.TEXT in to_formats
          export_doctags = OutputFormat.DOCTAGS in to_formats
@@ -309,6 +353,13 @@ def convert(
          )
          pipeline_options.table_structure_options.mode = table_mode

+         if image_export_mode != ImageRefMode.PLACEHOLDER:
+             pipeline_options.generate_page_images = True
+             pipeline_options.generate_picture_images = (
+                 True  # FIXME: to be deprecated in verson 3
+             )
+             pipeline_options.images_scale = 2
+
          if artifacts_path is not None:
              pipeline_options.artifacts_path = artifacts_path

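The same switches exist on the Python API; a sketch mirroring what the CLI now enables whenever the image export mode is not placeholder (the scale value is the one hard-coded above):

    from docling.datamodel.pipeline_options import PdfPipelineOptions

    pipeline_options = PdfPipelineOptions()
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True  # flagged above as slated for deprecation
    pipeline_options.images_scale = 2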
@@ -343,9 +394,11 @@ def convert(
              conv_results,
              output_dir=output,
              export_json=export_json,
+             export_html=export_html,
              export_md=export_md,
              export_txt=export_txt,
              export_doctags=export_doctags,
+             image_export_mode=image_export_mode,
          )

          end_time = time.time() - start_time
{docling-2.8.2 → docling-2.9.0}/docling/datamodel/base_models.py
@@ -24,6 +24,7 @@ class ConversionStatus(str, Enum):
      FAILURE = auto()
      SUCCESS = auto()
      PARTIAL_SUCCESS = auto()
+     SKIPPED = auto()


  class InputFormat(str, Enum):
@@ -40,6 +41,7 @@ class InputFormat(str, Enum):
  class OutputFormat(str, Enum):
      MARKDOWN = "md"
      JSON = "json"
+     HTML = "html"
      TEXT = "text"
      DOCTAGS = "doctags"

@@ -95,6 +97,7 @@ class DoclingComponentType(str, Enum):
      DOCUMENT_BACKEND = auto()
      MODEL = auto()
      DOC_ASSEMBLER = auto()
+     USER_INPUT = auto()


  class ErrorItem(BaseModel):
{docling-2.8.2 → docling-2.9.0}/docling/datamodel/document.py
@@ -3,7 +3,7 @@ import re
  from enum import Enum
  from io import BytesIO
  from pathlib import Path, PurePath
- from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
+ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union

  import filetype
  from docling_core.types.doc import (
@@ -164,12 +164,6 @@ class InputDocument(BaseModel):
          backend: Type[AbstractDocumentBackend],
          path_or_stream: Union[BytesIO, Path],
      ) -> None:
-         if backend is None:
-             raise RuntimeError(
-                 f"No backend configuration provided for file {self.file.name} with format {self.format}. "
-                 f"Please check your format configuration on DocumentConverter."
-             )
-
          self._backend = backend(self, path_or_stream=path_or_stream)
          if not self._backend.is_valid():
              self.valid = False
@@ -450,6 +444,25 @@ class ConversionResult(BaseModel):
          return ds_doc


+ class _DummyBackend(AbstractDocumentBackend):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     def is_valid(self) -> bool:
+         return False
+
+     @classmethod
+     def supported_formats(cls) -> Set[InputFormat]:
+         return set()
+
+     @classmethod
+     def supports_pagination(cls) -> bool:
+         return False
+
+     def unload(self):
+         return super().unload()
+
+
  class _DocumentConversionInput(BaseModel):

      path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
@@ -461,11 +474,12 @@ class _DocumentConversionInput(BaseModel):
          for item in self.path_or_stream_iterator:
              obj = resolve_source_to_stream(item) if isinstance(item, str) else item
              format = self._guess_format(obj)
+             backend: Type[AbstractDocumentBackend]
              if format not in format_options.keys():
-                 _log.info(
-                     f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
+                 _log.error(
+                     f"Input document {obj.name} does not match any allowed format."
                  )
-                 continue
+                 backend = _DummyBackend
              else:
                  backend = format_options[format].backend

{docling-2.8.2 → docling-2.9.0}/docling/document_converter.py
@@ -15,7 +15,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend
  from docling.backend.msexcel_backend import MsExcelDocumentBackend
  from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
  from docling.backend.msword_backend import MsWordDocumentBackend
- from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
+ from docling.datamodel.base_models import (
+     ConversionStatus,
+     DoclingComponentType,
+     DocumentStream,
+     ErrorItem,
+     InputFormat,
+ )
  from docling.datamodel.document import (
      ConversionResult,
      InputDocument,
@@ -23,6 +29,7 @@ from docling.datamodel.document import (
  )
  from docling.datamodel.pipeline_options import PipelineOptions
  from docling.datamodel.settings import DocumentLimits, settings
+ from docling.exceptions import ConversionError
  from docling.pipeline.base_pipeline import BasePipeline
  from docling.pipeline.simple_pipeline import SimplePipeline
  from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
@@ -85,32 +92,37 @@ class ImageFormatOption(FormatOption):
      backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend


- _format_to_default_options = {
-     InputFormat.XLSX: FormatOption(
-         pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
-     ),
-     InputFormat.DOCX: FormatOption(
-         pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
-     ),
-     InputFormat.PPTX: FormatOption(
-         pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
-     ),
-     InputFormat.MD: FormatOption(
-         pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
-     ),
-     InputFormat.ASCIIDOC: FormatOption(
-         pipeline_cls=SimplePipeline, backend=AsciiDocBackend
-     ),
-     InputFormat.HTML: FormatOption(
-         pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
-     ),
-     InputFormat.IMAGE: FormatOption(
-         pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
-     ),
-     InputFormat.PDF: FormatOption(
-         pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
-     ),
- }
+ def _get_default_option(format: InputFormat) -> FormatOption:
+     format_to_default_options = {
+         InputFormat.XLSX: FormatOption(
+             pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
+         ),
+         InputFormat.DOCX: FormatOption(
+             pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
+         ),
+         InputFormat.PPTX: FormatOption(
+             pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
+         ),
+         InputFormat.MD: FormatOption(
+             pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
+         ),
+         InputFormat.ASCIIDOC: FormatOption(
+             pipeline_cls=SimplePipeline, backend=AsciiDocBackend
+         ),
+         InputFormat.HTML: FormatOption(
+             pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
+         ),
+         InputFormat.IMAGE: FormatOption(
+             pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+         ),
+         InputFormat.PDF: FormatOption(
+             pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+         ),
+     }
+     if (options := format_to_default_options.get(format)) is not None:
+         return options
+     else:
+         raise RuntimeError(f"No default options configured for {format}")


  class DocumentConverter:
@@ -121,36 +133,26 @@ class DocumentConverter:
          allowed_formats: Optional[List[InputFormat]] = None,
          format_options: Optional[Dict[InputFormat, FormatOption]] = None,
      ):
-         self.allowed_formats = allowed_formats
-         self.format_to_options = format_options
-
-         if self.allowed_formats is None:
-             # if self.format_to_options is not None:
-             #    self.allowed_formats = self.format_to_options.keys()
-             # else:
-             self.allowed_formats = [e for e in InputFormat]  # all formats
-
-         if self.format_to_options is None:
-             self.format_to_options = _format_to_default_options
-         else:
-             for f in self.allowed_formats:
-                 if f not in self.format_to_options.keys():
-                     _log.debug(f"Requested format {f} will use default options.")
-                     self.format_to_options[f] = _format_to_default_options[f]
-
-             remove_keys = []
-             for f in self.format_to_options.keys():
-                 if f not in self.allowed_formats:
-                     remove_keys.append(f)
-
-             for f in remove_keys:
-                 self.format_to_options.pop(f)
-
+         self.allowed_formats = (
+             allowed_formats if allowed_formats is not None else [e for e in InputFormat]
+         )
+         self.format_to_options = {
+             format: (
+                 _get_default_option(format=format)
+                 if (custom_option := (format_options or {}).get(format)) is None
+                 else custom_option
+             )
+             for format in self.allowed_formats
+         }
          self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}

      def initialize_pipeline(self, format: InputFormat):
          """Initialize the conversion pipeline for the selected format."""
-         self._get_pipeline(doc_format=format)
+         pipeline = self._get_pipeline(doc_format=format)
+         if pipeline is None:
+             raise ConversionError(
+                 f"No pipeline could be initialized for format {format}"
+             )

      @validate_call(config=ConfigDict(strict=True))
      def convert(
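In effect, every format in allowed_formats now falls back to _get_default_option unless a custom FormatOption is supplied for it. A sketch of how a caller might mix the two (the option values are assumptions, not defaults):

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption

    converter = DocumentConverter(
        allowed_formats=[InputFormat.PDF, InputFormat.DOCX],
        format_options={
            # PDF gets a custom option; DOCX falls back to the default SimplePipeline setup.
            InputFormat.PDF: PdfFormatOption(pipeline_options=PdfPipelineOptions(do_ocr=False)),
        },
    )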
@@ -186,22 +188,28 @@ class DocumentConverter:
              limits=limits,
          )
          conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
+
+         had_result = False
          for conv_res in conv_res_iter:
+             had_result = True
              if raises_on_error and conv_res.status not in {
                  ConversionStatus.SUCCESS,
                  ConversionStatus.PARTIAL_SUCCESS,
              }:
-                 raise RuntimeError(
+                 raise ConversionError(
                      f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
                  )
              else:
                  yield conv_res

+         if not had_result and raises_on_error:
+             raise ConversionError(
+                 f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
+             )
+
      def _convert(
          self, conv_input: _DocumentConversionInput, raises_on_error: bool
      ) -> Iterator[ConversionResult]:
-         assert self.format_to_options is not None
-
          start_time = time.monotonic()

          for input_batch in chunkify(
@@ -223,27 +231,22 @@ class DocumentConverter:
              ):
                  elapsed = time.monotonic() - start_time
                  start_time = time.monotonic()
-
-                 if item is not None:
-                     _log.info(
-                         f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
-                     )
-                     yield item
-                 else:
-                     _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
+                 _log.info(
+                     f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
+                 )
+                 yield item

      def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
-         assert self.format_to_options is not None
-
          fopt = self.format_to_options.get(doc_format)

          if fopt is None:
-             raise RuntimeError(f"Could not get pipeline for {doc_format}")
+             return None
          else:
              pipeline_class = fopt.pipeline_cls
              pipeline_options = fopt.pipeline_options

-         assert pipeline_options is not None
+         if pipeline_options is None:
+             return None
          # TODO this will ignore if different options have been defined for the same pipeline class.
          if (
              pipeline_class not in self.initialized_pipelines
@@ -257,11 +260,26 @@ class DocumentConverter:

      def _process_document(
          self, in_doc: InputDocument, raises_on_error: bool
-     ) -> Optional[ConversionResult]:
-         assert self.allowed_formats is not None
-         assert in_doc.format in self.allowed_formats
+     ) -> ConversionResult:

-         conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
+         valid = (
+             self.allowed_formats is not None and in_doc.format in self.allowed_formats
+         )
+         if valid:
+             conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
+         else:
+             error_message = f"File format not allowed: {in_doc.file}"
+             if raises_on_error:
+                 raise ConversionError(error_message)
+             else:
+                 error_item = ErrorItem(
+                     component_type=DoclingComponentType.USER_INPUT,
+                     module_name="",
+                     error_message=error_message,
+                 )
+                 conv_res = ConversionResult(
+                     input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
+                 )

          return conv_res

@@ -270,26 +288,28 @@ class DocumentConverter:
      ) -> ConversionResult:
          if in_doc.valid:
              pipeline = self._get_pipeline(in_doc.format)
-             if pipeline is None:  # Can't find a default pipeline. Should this raise?
+             if pipeline is not None:
+                 conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
+             else:
                  if raises_on_error:
-                     raise RuntimeError(
+                     raise ConversionError(
                          f"No pipeline could be initialized for {in_doc.file}."
                      )
                  else:
-                     conv_res = ConversionResult(input=in_doc)
-                     conv_res.status = ConversionStatus.FAILURE
-                     return conv_res
-
-             conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
-
+                     conv_res = ConversionResult(
+                         input=in_doc,
+                         status=ConversionStatus.FAILURE,
+                     )
          else:
              if raises_on_error:
-                 raise RuntimeError(f"Input document {in_doc.file} is not valid.")
+                 raise ConversionError(f"Input document {in_doc.file} is not valid.")

              else:
                  # invalid doc or not of desired format
-                 conv_res = ConversionResult(input=in_doc)
-                 conv_res.status = ConversionStatus.FAILURE
+                 conv_res = ConversionResult(
+                     input=in_doc,
+                     status=ConversionStatus.FAILURE,
+                 )
                  # TODO add error log why it failed.

          return conv_res
docling-2.9.0/docling/exceptions.py (new file)
@@ -0,0 +1,6 @@
+ class BaseError(RuntimeError):
+     pass
+
+
+ class ConversionError(BaseError):
+     pass
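Together with the converter changes above, this exception type replaces the bare RuntimeError: an input whose format is unrecognized or not in allowed_formats now raises ConversionError when raises_on_error is set, and otherwise comes back as a SKIPPED result rather than being silently dropped. A behavioral sketch under that reading (the input file name is illustrative):

    from docling.datamodel.base_models import ConversionStatus, InputFormat
    from docling.document_converter import DocumentConverter
    from docling.exceptions import ConversionError

    converter = DocumentConverter(allowed_formats=[InputFormat.PDF])

    result = converter.convert("notes.docx", raises_on_error=False)  # illustrative input
    print(result.status)  # expected: ConversionStatus.SKIPPED

    try:
        converter.convert("notes.docx")  # raises_on_error defaults to True
    except ConversionError as err:
        print(err)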
docling-2.9.0/docling/py.typed (new file)
@@ -0,0 +1 @@
+
{docling-2.8.2 → docling-2.9.0}/pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "docling"
- version = "2.8.2"  # DO NOT EDIT, updated automatically
+ version = "2.9.0"  # DO NOT EDIT, updated automatically
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
  license = "MIT"
@@ -25,8 +25,8 @@ packages = [{include = "docling"}]
  # actual dependencies:
  ######################
  python = "^3.9"
- pydantic = ">=2.0.0,<2.10"
- docling-core = "^2.6.1"
+ docling-core = { version = "^2.8.0", extras = ["chunking"] }
+ pydantic = "^2.0.0"
  docling-ibm-models = "^2.0.6"
  deepsearch-glm = "^0.26.1"
  filetype = "^1.2.0"