docling 2.55.1__tar.gz → 2.56.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic; see the original diff page for more details.

Files changed (148)
  1. {docling-2.55.1 → docling-2.56.1}/PKG-INFO +5 -3
  2. {docling-2.55.1 → docling-2.56.1}/docling/backend/html_backend.py +36 -15
  3. {docling-2.55.1 → docling-2.56.1}/docling/backend/msexcel_backend.py +13 -9
  4. {docling-2.55.1 → docling-2.56.1}/docling/cli/main.py +33 -8
  5. {docling-2.55.1 → docling-2.56.1}/docling/cli/models.py +3 -1
  6. {docling-2.55.1 → docling-2.56.1}/docling/datamodel/pipeline_options.py +15 -1
  7. docling-2.56.1/docling/models/auto_ocr_model.py +132 -0
  8. {docling-2.55.1 → docling-2.56.1}/docling/models/base_model.py +2 -2
  9. {docling-2.55.1 → docling-2.56.1}/docling/models/plugins/defaults.py +2 -0
  10. {docling-2.55.1 → docling-2.56.1}/docling/models/rapid_ocr_model.py +126 -5
  11. {docling-2.55.1 → docling-2.56.1}/docling/models/tesseract_ocr_cli_model.py +4 -0
  12. {docling-2.55.1 → docling-2.56.1}/docling/models/tesseract_ocr_model.py +15 -5
  13. {docling-2.55.1 → docling-2.56.1}/docling/pipeline/asr_pipeline.py +53 -6
  14. {docling-2.55.1 → docling-2.56.1}/docling/utils/model_downloader.py +13 -1
  15. {docling-2.55.1 → docling-2.56.1}/docling.egg-info/PKG-INFO +5 -3
  16. {docling-2.55.1 → docling-2.56.1}/docling.egg-info/SOURCES.txt +1 -0
  17. {docling-2.55.1 → docling-2.56.1}/docling.egg-info/requires.txt +9 -2
  18. {docling-2.55.1 → docling-2.56.1}/pyproject.toml +6 -3
  19. {docling-2.55.1 → docling-2.56.1}/tests/test_asr_pipeline.py +26 -0
  20. {docling-2.55.1 → docling-2.56.1}/tests/test_backend_msexcel.py +17 -2
  21. {docling-2.55.1 → docling-2.56.1}/LICENSE +0 -0
  22. {docling-2.55.1 → docling-2.56.1}/README.md +0 -0
  23. {docling-2.55.1 → docling-2.56.1}/docling/__init__.py +0 -0
  24. {docling-2.55.1 → docling-2.56.1}/docling/backend/__init__.py +0 -0
  25. {docling-2.55.1 → docling-2.56.1}/docling/backend/abstract_backend.py +0 -0
  26. {docling-2.55.1 → docling-2.56.1}/docling/backend/asciidoc_backend.py +0 -0
  27. {docling-2.55.1 → docling-2.56.1}/docling/backend/csv_backend.py +0 -0
  28. {docling-2.55.1 → docling-2.56.1}/docling/backend/docling_parse_backend.py +0 -0
  29. {docling-2.55.1 → docling-2.56.1}/docling/backend/docling_parse_v2_backend.py +0 -0
  30. {docling-2.55.1 → docling-2.56.1}/docling/backend/docling_parse_v4_backend.py +0 -0
  31. {docling-2.55.1 → docling-2.56.1}/docling/backend/docx/__init__.py +0 -0
  32. {docling-2.55.1 → docling-2.56.1}/docling/backend/docx/latex/__init__.py +0 -0
  33. {docling-2.55.1 → docling-2.56.1}/docling/backend/docx/latex/latex_dict.py +0 -0
  34. {docling-2.55.1 → docling-2.56.1}/docling/backend/docx/latex/omml.py +0 -0
  35. {docling-2.55.1 → docling-2.56.1}/docling/backend/json/__init__.py +0 -0
  36. {docling-2.55.1 → docling-2.56.1}/docling/backend/json/docling_json_backend.py +0 -0
  37. {docling-2.55.1 → docling-2.56.1}/docling/backend/md_backend.py +0 -0
  38. {docling-2.55.1 → docling-2.56.1}/docling/backend/mets_gbs_backend.py +0 -0
  39. {docling-2.55.1 → docling-2.56.1}/docling/backend/mspowerpoint_backend.py +0 -0
  40. {docling-2.55.1 → docling-2.56.1}/docling/backend/msword_backend.py +0 -0
  41. {docling-2.55.1 → docling-2.56.1}/docling/backend/noop_backend.py +0 -0
  42. {docling-2.55.1 → docling-2.56.1}/docling/backend/pdf_backend.py +0 -0
  43. {docling-2.55.1 → docling-2.56.1}/docling/backend/pypdfium2_backend.py +0 -0
  44. {docling-2.55.1 → docling-2.56.1}/docling/backend/webvtt_backend.py +0 -0
  45. {docling-2.55.1 → docling-2.56.1}/docling/backend/xml/__init__.py +0 -0
  46. {docling-2.55.1 → docling-2.56.1}/docling/backend/xml/jats_backend.py +0 -0
  47. {docling-2.55.1 → docling-2.56.1}/docling/backend/xml/uspto_backend.py +0 -0
  48. {docling-2.55.1 → docling-2.56.1}/docling/chunking/__init__.py +0 -0
  49. {docling-2.55.1 → docling-2.56.1}/docling/cli/__init__.py +0 -0
  50. {docling-2.55.1 → docling-2.56.1}/docling/cli/tools.py +0 -0
  51. {docling-2.55.1 → docling-2.56.1}/docling/datamodel/__init__.py +0 -0
  52. {docling-2.55.1 → docling-2.56.1}/docling/datamodel/accelerator_options.py +0 -0
  53. {docling-2.55.1 → docling-2.56.1}/docling/datamodel/asr_model_specs.py +0 -0
  54. {docling-2.55.1 → docling-2.56.1}/docling/datamodel/base_models.py +0 -0
  55. {docling-2.55.1 → docling-2.56.1}/docling/datamodel/document.py +0 -0
  56. {docling-2.55.1 → docling-2.56.1}/docling/datamodel/extraction.py +0 -0
  57. {docling-2.55.1 → docling-2.56.1}/docling/datamodel/layout_model_specs.py +0 -0
  58. {docling-2.55.1 → docling-2.56.1}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  59. {docling-2.55.1 → docling-2.56.1}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
  60. {docling-2.55.1 → docling-2.56.1}/docling/datamodel/settings.py +0 -0
  61. {docling-2.55.1 → docling-2.56.1}/docling/datamodel/vlm_model_specs.py +0 -0
  62. {docling-2.55.1 → docling-2.56.1}/docling/document_converter.py +0 -0
  63. {docling-2.55.1 → docling-2.56.1}/docling/document_extractor.py +0 -0
  64. {docling-2.55.1 → docling-2.56.1}/docling/exceptions.py +0 -0
  65. {docling-2.55.1 → docling-2.56.1}/docling/models/__init__.py +0 -0
  66. {docling-2.55.1 → docling-2.56.1}/docling/models/api_vlm_model.py +0 -0
  67. {docling-2.55.1 → docling-2.56.1}/docling/models/base_ocr_model.py +0 -0
  68. {docling-2.55.1 → docling-2.56.1}/docling/models/code_formula_model.py +0 -0
  69. {docling-2.55.1 → docling-2.56.1}/docling/models/document_picture_classifier.py +0 -0
  70. {docling-2.55.1 → docling-2.56.1}/docling/models/easyocr_model.py +0 -0
  71. {docling-2.55.1 → docling-2.56.1}/docling/models/factories/__init__.py +0 -0
  72. {docling-2.55.1 → docling-2.56.1}/docling/models/factories/base_factory.py +0 -0
  73. {docling-2.55.1 → docling-2.56.1}/docling/models/factories/ocr_factory.py +0 -0
  74. {docling-2.55.1 → docling-2.56.1}/docling/models/factories/picture_description_factory.py +0 -0
  75. {docling-2.55.1 → docling-2.56.1}/docling/models/layout_model.py +0 -0
  76. {docling-2.55.1 → docling-2.56.1}/docling/models/ocr_mac_model.py +0 -0
  77. {docling-2.55.1 → docling-2.56.1}/docling/models/page_assemble_model.py +0 -0
  78. {docling-2.55.1 → docling-2.56.1}/docling/models/page_preprocessing_model.py +0 -0
  79. {docling-2.55.1 → docling-2.56.1}/docling/models/picture_description_api_model.py +0 -0
  80. {docling-2.55.1 → docling-2.56.1}/docling/models/picture_description_base_model.py +0 -0
  81. {docling-2.55.1 → docling-2.56.1}/docling/models/picture_description_vlm_model.py +0 -0
  82. {docling-2.55.1 → docling-2.56.1}/docling/models/plugins/__init__.py +0 -0
  83. {docling-2.55.1 → docling-2.56.1}/docling/models/readingorder_model.py +0 -0
  84. {docling-2.55.1 → docling-2.56.1}/docling/models/table_structure_model.py +0 -0
  85. {docling-2.55.1 → docling-2.56.1}/docling/models/utils/__init__.py +0 -0
  86. {docling-2.55.1 → docling-2.56.1}/docling/models/utils/generation_utils.py +0 -0
  87. {docling-2.55.1 → docling-2.56.1}/docling/models/utils/hf_model_download.py +0 -0
  88. {docling-2.55.1 → docling-2.56.1}/docling/models/vlm_models_inline/__init__.py +0 -0
  89. {docling-2.55.1 → docling-2.56.1}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
  90. {docling-2.55.1 → docling-2.56.1}/docling/models/vlm_models_inline/mlx_model.py +0 -0
  91. {docling-2.55.1 → docling-2.56.1}/docling/models/vlm_models_inline/nuextract_transformers_model.py +0 -0
  92. {docling-2.55.1 → docling-2.56.1}/docling/models/vlm_models_inline/vllm_model.py +0 -0
  93. {docling-2.55.1 → docling-2.56.1}/docling/pipeline/__init__.py +0 -0
  94. {docling-2.55.1 → docling-2.56.1}/docling/pipeline/base_extraction_pipeline.py +0 -0
  95. {docling-2.55.1 → docling-2.56.1}/docling/pipeline/base_pipeline.py +0 -0
  96. {docling-2.55.1 → docling-2.56.1}/docling/pipeline/extraction_vlm_pipeline.py +0 -0
  97. {docling-2.55.1 → docling-2.56.1}/docling/pipeline/simple_pipeline.py +0 -0
  98. {docling-2.55.1 → docling-2.56.1}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  99. {docling-2.55.1 → docling-2.56.1}/docling/pipeline/threaded_standard_pdf_pipeline.py +0 -0
  100. {docling-2.55.1 → docling-2.56.1}/docling/pipeline/vlm_pipeline.py +0 -0
  101. {docling-2.55.1 → docling-2.56.1}/docling/py.typed +0 -0
  102. {docling-2.55.1 → docling-2.56.1}/docling/utils/__init__.py +0 -0
  103. {docling-2.55.1 → docling-2.56.1}/docling/utils/accelerator_utils.py +0 -0
  104. {docling-2.55.1 → docling-2.56.1}/docling/utils/api_image_request.py +0 -0
  105. {docling-2.55.1 → docling-2.56.1}/docling/utils/export.py +0 -0
  106. {docling-2.55.1 → docling-2.56.1}/docling/utils/glm_utils.py +0 -0
  107. {docling-2.55.1 → docling-2.56.1}/docling/utils/layout_postprocessor.py +0 -0
  108. {docling-2.55.1 → docling-2.56.1}/docling/utils/locks.py +0 -0
  109. {docling-2.55.1 → docling-2.56.1}/docling/utils/ocr_utils.py +0 -0
  110. {docling-2.55.1 → docling-2.56.1}/docling/utils/orientation.py +0 -0
  111. {docling-2.55.1 → docling-2.56.1}/docling/utils/profiling.py +0 -0
  112. {docling-2.55.1 → docling-2.56.1}/docling/utils/utils.py +0 -0
  113. {docling-2.55.1 → docling-2.56.1}/docling/utils/visualization.py +0 -0
  114. {docling-2.55.1 → docling-2.56.1}/docling.egg-info/dependency_links.txt +0 -0
  115. {docling-2.55.1 → docling-2.56.1}/docling.egg-info/entry_points.txt +0 -0
  116. {docling-2.55.1 → docling-2.56.1}/docling.egg-info/top_level.txt +0 -0
  117. {docling-2.55.1 → docling-2.56.1}/setup.cfg +0 -0
  118. {docling-2.55.1 → docling-2.56.1}/tests/test_backend_asciidoc.py +0 -0
  119. {docling-2.55.1 → docling-2.56.1}/tests/test_backend_csv.py +0 -0
  120. {docling-2.55.1 → docling-2.56.1}/tests/test_backend_docling_json.py +0 -0
  121. {docling-2.55.1 → docling-2.56.1}/tests/test_backend_docling_parse.py +0 -0
  122. {docling-2.55.1 → docling-2.56.1}/tests/test_backend_docling_parse_v2.py +0 -0
  123. {docling-2.55.1 → docling-2.56.1}/tests/test_backend_docling_parse_v4.py +0 -0
  124. {docling-2.55.1 → docling-2.56.1}/tests/test_backend_html.py +0 -0
  125. {docling-2.55.1 → docling-2.56.1}/tests/test_backend_jats.py +0 -0
  126. {docling-2.55.1 → docling-2.56.1}/tests/test_backend_markdown.py +0 -0
  127. {docling-2.55.1 → docling-2.56.1}/tests/test_backend_mets_gbs.py +0 -0
  128. {docling-2.55.1 → docling-2.56.1}/tests/test_backend_msword.py +0 -0
  129. {docling-2.55.1 → docling-2.56.1}/tests/test_backend_patent_uspto.py +0 -0
  130. {docling-2.55.1 → docling-2.56.1}/tests/test_backend_pdfium.py +0 -0
  131. {docling-2.55.1 → docling-2.56.1}/tests/test_backend_pptx.py +0 -0
  132. {docling-2.55.1 → docling-2.56.1}/tests/test_backend_vtt.py +0 -0
  133. {docling-2.55.1 → docling-2.56.1}/tests/test_backend_webp.py +0 -0
  134. {docling-2.55.1 → docling-2.56.1}/tests/test_cli.py +0 -0
  135. {docling-2.55.1 → docling-2.56.1}/tests/test_code_formula.py +0 -0
  136. {docling-2.55.1 → docling-2.56.1}/tests/test_data_gen_flag.py +0 -0
  137. {docling-2.55.1 → docling-2.56.1}/tests/test_document_picture_classifier.py +0 -0
  138. {docling-2.55.1 → docling-2.56.1}/tests/test_e2e_conversion.py +0 -0
  139. {docling-2.55.1 → docling-2.56.1}/tests/test_e2e_ocr_conversion.py +0 -0
  140. {docling-2.55.1 → docling-2.56.1}/tests/test_extraction.py +0 -0
  141. {docling-2.55.1 → docling-2.56.1}/tests/test_input_doc.py +0 -0
  142. {docling-2.55.1 → docling-2.56.1}/tests/test_interfaces.py +0 -0
  143. {docling-2.55.1 → docling-2.56.1}/tests/test_invalid_input.py +0 -0
  144. {docling-2.55.1 → docling-2.56.1}/tests/test_legacy_format_transform.py +0 -0
  145. {docling-2.55.1 → docling-2.56.1}/tests/test_ocr_utils.py +0 -0
  146. {docling-2.55.1 → docling-2.56.1}/tests/test_options.py +0 -0
  147. {docling-2.55.1 → docling-2.56.1}/tests/test_settings_load.py +0 -0
  148. {docling-2.55.1 → docling-2.56.1}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.55.1
3
+ Version: 2.56.1
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -34,7 +34,8 @@ Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
34
34
  Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
35
35
  Requires-Dist: huggingface_hub<1,>=0.23
36
36
  Requires-Dist: requests<3.0.0,>=2.32.2
37
- Requires-Dist: easyocr<2.0,>=1.7
37
+ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin"
38
+ Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14"
38
39
  Requires-Dist: certifi>=2024.7.4
39
40
  Requires-Dist: rtree<2.0.0,>=1.3.0
40
41
  Requires-Dist: typer<0.20.0,>=0.12.5
@@ -52,6 +53,8 @@ Requires-Dist: pylatexenc<3.0,>=2.10
52
53
  Requires-Dist: scipy<2.0.0,>=1.6.0
53
54
  Requires-Dist: accelerate<2,>=1.0.0
54
55
  Requires-Dist: polyfactory>=2.22.2
56
+ Provides-Extra: easyocr
57
+ Requires-Dist: easyocr<2.0,>=1.7; extra == "easyocr"
55
58
  Provides-Extra: tesserocr
56
59
  Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
57
60
  Provides-Extra: ocrmac
@@ -65,7 +68,6 @@ Requires-Dist: qwen-vl-utils>=0.0.11; extra == "vlm"
65
68
  Provides-Extra: rapidocr
66
69
  Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14" and extra == "rapidocr"
67
70
  Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
68
- Requires-Dist: modelscope>=1.29.0; extra == "rapidocr"
69
71
  Provides-Extra: asr
70
72
  Requires-Dist: openai-whisper>=20250625; extra == "asr"
71
73
  Dynamic: license-file
@@ -272,9 +272,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
272
272
  for br in content("br"):
273
273
  br.replace_with(NavigableString("\n"))
274
274
  # set default content layer
275
- headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
275
+
276
+ # Furniture before the first heading rule, except for headers in tables
277
+ header = None
278
+ # Find all headers first
279
+ all_headers = content.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
280
+ # Keep only those that do NOT have a <table> in a parent chain
281
+ clean_headers = [h for h in all_headers if not h.find_parent("table")]
282
+ # Pick the first header from the remaining
283
+ if len(clean_headers):
284
+ header = clean_headers[0]
285
+ # Set starting content layer
276
286
  self.content_layer = (
277
- ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
287
+ ContentLayer.BODY if header is None else ContentLayer.FURNITURE
278
288
  )
279
289
  # reset context
280
290
  self.ctx = _Context()
@@ -309,9 +319,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
309
319
  group_name: str,
310
320
  doc: DoclingDocument,
311
321
  docling_table: TableItem,
312
- ) -> tuple[bool, RefItem]:
322
+ ) -> tuple[bool, Union[RefItem, None]]:
313
323
  rich_table_cell = False
314
- ref_for_rich_cell = provs_in_cell[0]
324
+ ref_for_rich_cell = None
325
+ if len(provs_in_cell) > 0:
326
+ ref_for_rich_cell = provs_in_cell[0]
315
327
  if len(provs_in_cell) > 1:
316
328
  # Cell has multiple elements, we need to group them
317
329
  rich_table_cell = True
@@ -324,7 +336,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
324
336
  if isinstance(pr_item, TextItem):
325
337
  # Cell has only one element and it's just a text
326
338
  rich_table_cell = False
327
- doc.delete_items(node_items=[pr_item])
339
+ try:
340
+ doc.delete_items(node_items=[pr_item])
341
+ except Exception as e:
342
+ _log.error(f"Error while making rich table: {e}.")
328
343
  else:
329
344
  rich_table_cell = True
330
345
  ref_for_rich_cell = HTMLDocumentBackend.group_cell_elements(
@@ -391,17 +406,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
391
406
 
392
407
  provs_in_cell: list[RefItem] = []
393
408
  # Parse table cell sub-tree for Rich Cells content:
409
+ table_level = self.level
394
410
  provs_in_cell = self._walk(html_cell, doc)
411
+ # After walking sub-tree in cell, restore previously set level
412
+ self.level = table_level
395
413
 
396
414
  rich_table_cell = False
397
415
  ref_for_rich_cell = None
398
- if len(provs_in_cell) > 0:
399
- group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
400
- rich_table_cell, ref_for_rich_cell = (
401
- HTMLDocumentBackend.process_rich_table_cells(
402
- provs_in_cell, group_name, doc, docling_table
403
- )
416
+ group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
417
+ rich_table_cell, ref_for_rich_cell = (
418
+ HTMLDocumentBackend.process_rich_table_cells(
419
+ provs_in_cell, group_name, doc, docling_table
404
420
  )
421
+ )
405
422
 
406
423
  # Extracting text
407
424
  text = self.get_text(html_cell).strip()
@@ -774,13 +791,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
774
791
  for key in self.parents.keys():
775
792
  self.parents[key] = None
776
793
  self.level = 0
777
- docling_title = self.parents[self.level + 1] = doc.add_title(
794
+ self.parents[self.level + 1] = doc.add_title(
778
795
  text_clean,
779
796
  content_layer=self.content_layer,
780
797
  formatting=annotated_text.formatting,
781
798
  hyperlink=annotated_text.hyperlink,
782
799
  )
783
- added_ref = [docling_title.get_ref()]
800
+ p1 = self.parents[self.level + 1]
801
+ if p1 is not None:
802
+ added_ref = [p1.get_ref()]
784
803
  # the other levels need to be lowered by 1 if a title was set
785
804
  else:
786
805
  level -= 1
@@ -802,7 +821,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
802
821
  _log.debug(f"Remove the tail of level {key}")
803
822
  self.parents[key] = None
804
823
  self.level = level
805
- docling_heading = self.parents[self.level + 1] = doc.add_heading(
824
+ self.parents[self.level + 1] = doc.add_heading(
806
825
  parent=self.parents[self.level],
807
826
  text=text_clean,
808
827
  orig=annotated_text.text,
@@ -811,7 +830,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
811
830
  formatting=annotated_text.formatting,
812
831
  hyperlink=annotated_text.hyperlink,
813
832
  )
814
- added_ref = [docling_heading.get_ref()]
833
+ p2 = self.parents[self.level + 1]
834
+ if p2 is not None:
835
+ added_ref = [p2.get_ref()]
815
836
  self.level += 1
816
837
  for img_tag in tag("img"):
817
838
  if isinstance(img_tag, Tag):
@@ -18,6 +18,7 @@ from docling_core.types.doc import (
18
18
  TableData,
19
19
  )
20
20
  from openpyxl import load_workbook
21
+ from openpyxl.chartsheet.chartsheet import Chartsheet
21
22
  from openpyxl.drawing.image import Image
22
23
  from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
23
24
  from openpyxl.worksheet.worksheet import Worksheet
@@ -186,18 +187,18 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
186
187
 
187
188
  if self.workbook is not None:
188
189
  # Iterate over all sheets
189
- for sheet_name in self.workbook.sheetnames:
190
- _log.info(f"Processing sheet: {sheet_name}")
190
+ for idx, name in enumerate(self.workbook.sheetnames):
191
+ _log.info(f"Processing sheet {idx}: {name}")
191
192
 
192
- sheet = self.workbook[sheet_name]
193
- page_no = self.workbook.index(sheet) + 1
193
+ sheet = self.workbook[name]
194
+ page_no = idx + 1
194
195
  # do not rely on sheet.max_column, sheet.max_row if there are images
195
196
  page = doc.add_page(page_no=page_no, size=Size(width=0, height=0))
196
197
 
197
198
  self.parents[0] = doc.add_group(
198
199
  parent=None,
199
200
  label=GroupLabel.SECTION,
200
- name=f"sheet: {sheet_name}",
201
+ name=f"sheet: {name}",
201
202
  content_layer=self._get_sheet_content_layer(sheet),
202
203
  )
203
204
  doc = self._convert_sheet(doc, sheet)
@@ -208,7 +209,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
208
209
 
209
210
  return doc
210
211
 
211
- def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument:
212
+ def _convert_sheet(
213
+ self, doc: DoclingDocument, sheet: Union[Worksheet, Chartsheet]
214
+ ) -> DoclingDocument:
212
215
  """Parse an Excel worksheet and attach its structure to a DoclingDocument
213
216
 
214
217
  Args:
@@ -218,10 +221,11 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
218
221
  Returns:
219
222
  The updated DoclingDocument.
220
223
  """
224
+ if isinstance(sheet, Worksheet):
225
+ doc = self._find_tables_in_sheet(doc, sheet)
226
+ doc = self._find_images_in_sheet(doc, sheet)
221
227
 
222
- doc = self._find_tables_in_sheet(doc, sheet)
223
-
224
- doc = self._find_images_in_sheet(doc, sheet)
228
+ # TODO: parse charts in sheet
225
229
 
226
230
  return doc
227
231
 
@@ -49,7 +49,7 @@ from docling.datamodel.document import ConversionResult
49
49
  from docling.datamodel.pipeline_options import (
50
50
  AsrPipelineOptions,
51
51
  ConvertPipelineOptions,
52
- EasyOcrOptions,
52
+ OcrAutoOptions,
53
53
  OcrOptions,
54
54
  PaginatedPipelineOptions,
55
55
  PdfBackend,
@@ -57,6 +57,8 @@ from docling.datamodel.pipeline_options import (
57
57
  PipelineOptions,
58
58
  ProcessingPipeline,
59
59
  TableFormerMode,
60
+ TesseractCliOcrOptions,
61
+ TesseractOcrOptions,
60
62
  VlmPipelineOptions,
61
63
  )
62
64
  from docling.datamodel.settings import settings
@@ -372,7 +374,7 @@ def convert( # noqa: C901
372
374
  f"Use the option --show-external-plugins to see the options allowed with external plugins."
373
375
  ),
374
376
  ),
375
- ] = EasyOcrOptions.kind,
377
+ ] = OcrAutoOptions.kind,
376
378
  ocr_lang: Annotated[
377
379
  Optional[str],
378
380
  typer.Option(
@@ -380,6 +382,13 @@ def convert( # noqa: C901
380
382
  help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
381
383
  ),
382
384
  ] = None,
385
+ psm: Annotated[
386
+ Optional[int],
387
+ typer.Option(
388
+ ...,
389
+ help="Page Segmentation Mode for the OCR engine (0-13).",
390
+ ),
391
+ ] = None,
383
392
  pdf_backend: Annotated[
384
393
  PdfBackend, typer.Option(..., help="The PDF backend to use.")
385
394
  ] = PdfBackend.DLPARSE_V2,
@@ -547,13 +556,25 @@ def convert( # noqa: C901
547
556
  if local_path.exists() and local_path.is_dir():
548
557
  for fmt in from_formats:
549
558
  for ext in FormatToExtensions[fmt]:
550
- input_doc_paths.extend(
551
- list(local_path.glob(f"**/*.{ext}"))
552
- )
553
- input_doc_paths.extend(
554
- list(local_path.glob(f"**/*.{ext.upper()}"))
555
- )
559
+ for path in local_path.glob(f"**/*.{ext}"):
560
+ if path.name.startswith("~$") and ext == "docx":
561
+ _log.info(
562
+ f"Ignoring temporary Word file: {path}"
563
+ )
564
+ continue
565
+ input_doc_paths.append(path)
566
+
567
+ for path in local_path.glob(f"**/*.{ext.upper()}"):
568
+ if path.name.startswith("~$") and ext == "docx":
569
+ _log.info(
570
+ f"Ignoring temporary Word file: {path}"
571
+ )
572
+ continue
573
+ input_doc_paths.append(path)
556
574
  elif local_path.exists():
575
+ if not local_path.name.startswith("~$") and ext == "docx":
576
+ _log.info(f"Ignoring temporary Word file: {path}")
577
+ continue
557
578
  input_doc_paths.append(local_path)
558
579
  else:
559
580
  err_console.print(
@@ -584,6 +605,10 @@ def convert( # noqa: C901
584
605
  ocr_lang_list = _split_list(ocr_lang)
585
606
  if ocr_lang_list is not None:
586
607
  ocr_options.lang = ocr_lang_list
608
+ if psm is not None and isinstance(
609
+ ocr_options, (TesseractOcrOptions, TesseractCliOcrOptions)
610
+ ):
611
+ ocr_options.psm = psm
587
612
 
588
613
  accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
589
614
  # pipeline_options: PaginatedPipelineOptions
@@ -38,6 +38,7 @@ class _AvailableModels(str, Enum):
38
38
  SMOLDOCLING = "smoldocling"
39
39
  SMOLDOCLING_MLX = "smoldocling_mlx"
40
40
  GRANITE_VISION = "granite_vision"
41
+ RAPIDOCR = "rapidocr"
41
42
  EASYOCR = "easyocr"
42
43
 
43
44
 
@@ -46,7 +47,7 @@ _default_models = [
46
47
  _AvailableModels.TABLEFORMER,
47
48
  _AvailableModels.CODE_FORMULA,
48
49
  _AvailableModels.PICTURE_CLASSIFIER,
49
- _AvailableModels.EASYOCR,
50
+ _AvailableModels.RAPIDOCR,
50
51
  ]
51
52
 
52
53
 
@@ -115,6 +116,7 @@ def download(
115
116
  with_smoldocling=_AvailableModels.SMOLDOCLING in to_download,
116
117
  with_smoldocling_mlx=_AvailableModels.SMOLDOCLING_MLX in to_download,
117
118
  with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
119
+ with_rapidocr=_AvailableModels.RAPIDOCR in to_download,
118
120
  with_easyocr=_AvailableModels.EASYOCR in to_download,
119
121
  )
120
122
 
@@ -81,6 +81,13 @@ class OcrOptions(BaseOptions):
81
81
  )
82
82
 
83
83
 
84
+ class OcrAutoOptions(OcrOptions):
85
+ """Options for pick OCR engine automatically."""
86
+
87
+ kind: ClassVar[Literal["auto"]] = "auto"
88
+ lang: List[str] = []
89
+
90
+
84
91
  class RapidOcrOptions(OcrOptions):
85
92
  """Options for the RapidOCR engine."""
86
93
 
@@ -154,6 +161,9 @@ class TesseractCliOcrOptions(OcrOptions):
154
161
  lang: List[str] = ["fra", "deu", "spa", "eng"]
155
162
  tesseract_cmd: str = "tesseract"
156
163
  path: Optional[str] = None
164
+ psm: Optional[int] = (
165
+ None # Page Segmentation Mode (0-13), defaults to tesseract's default
166
+ )
157
167
 
158
168
  model_config = ConfigDict(
159
169
  extra="forbid",
@@ -166,6 +176,9 @@ class TesseractOcrOptions(OcrOptions):
166
176
  kind: ClassVar[Literal["tesserocr"]] = "tesserocr"
167
177
  lang: List[str] = ["fra", "deu", "spa", "eng"]
168
178
  path: Optional[str] = None
179
+ psm: Optional[int] = (
180
+ None # Page Segmentation Mode (0-13), defaults to tesseract's default
181
+ )
169
182
 
170
183
  model_config = ConfigDict(
171
184
  extra="forbid",
@@ -249,6 +262,7 @@ class PdfBackend(str, Enum):
249
262
  class OcrEngine(str, Enum):
250
263
  """Enum of valid OCR engines."""
251
264
 
265
+ AUTO = "auto"
252
266
  EASYOCR = "easyocr"
253
267
  TESSERACT_CLI = "tesseract_cli"
254
268
  TESSERACT = "tesseract"
@@ -330,7 +344,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
330
344
  # If True, text from backend will be used instead of generated text
331
345
 
332
346
  table_structure_options: TableStructureOptions = TableStructureOptions()
333
- ocr_options: OcrOptions = EasyOcrOptions()
347
+ ocr_options: OcrOptions = OcrAutoOptions()
334
348
  layout_options: LayoutOptions = LayoutOptions()
335
349
 
336
350
  images_scale: float = 1.0
@@ -0,0 +1,132 @@
1
+ import logging
2
+ import sys
3
+ from collections.abc import Iterable
4
+ from pathlib import Path
5
+ from typing import Optional, Type
6
+
7
+ from docling.datamodel.accelerator_options import AcceleratorOptions
8
+ from docling.datamodel.base_models import Page
9
+ from docling.datamodel.document import ConversionResult
10
+ from docling.datamodel.pipeline_options import (
11
+ EasyOcrOptions,
12
+ OcrAutoOptions,
13
+ OcrMacOptions,
14
+ OcrOptions,
15
+ RapidOcrOptions,
16
+ )
17
+ from docling.models.base_ocr_model import BaseOcrModel
18
+ from docling.models.easyocr_model import EasyOcrModel
19
+ from docling.models.ocr_mac_model import OcrMacModel
20
+ from docling.models.rapid_ocr_model import RapidOcrModel
21
+
22
+ _log = logging.getLogger(__name__)
23
+
24
+
25
+ class OcrAutoModel(BaseOcrModel):
26
+ def __init__(
27
+ self,
28
+ enabled: bool,
29
+ artifacts_path: Optional[Path],
30
+ options: OcrAutoOptions,
31
+ accelerator_options: AcceleratorOptions,
32
+ ):
33
+ super().__init__(
34
+ enabled=enabled,
35
+ artifacts_path=artifacts_path,
36
+ options=options,
37
+ accelerator_options=accelerator_options,
38
+ )
39
+ self.options: OcrAutoOptions
40
+
41
+ self._engine: Optional[BaseOcrModel] = None
42
+ if self.enabled:
43
+ if "darwin" == sys.platform:
44
+ try:
45
+ from ocrmac import ocrmac
46
+
47
+ self._engine = OcrMacModel(
48
+ enabled=self.enabled,
49
+ artifacts_path=artifacts_path,
50
+ options=OcrMacOptions(
51
+ bitmap_area_threshold=self.options.bitmap_area_threshold,
52
+ force_full_page_ocr=self.options.force_full_page_ocr,
53
+ ),
54
+ accelerator_options=accelerator_options,
55
+ )
56
+ _log.info("Auto OCR model selected ocrmac.")
57
+ except ImportError:
58
+ _log.info("ocrmac cannot be used because ocrmac is not installed.")
59
+
60
+ if self._engine is None:
61
+ try:
62
+ import onnxruntime
63
+ from rapidocr import EngineType, RapidOCR # type: ignore
64
+
65
+ self._engine = RapidOcrModel(
66
+ enabled=self.enabled,
67
+ artifacts_path=artifacts_path,
68
+ options=RapidOcrOptions(
69
+ backend="onnxruntime",
70
+ bitmap_area_threshold=self.options.bitmap_area_threshold,
71
+ force_full_page_ocr=self.options.force_full_page_ocr,
72
+ ),
73
+ accelerator_options=accelerator_options,
74
+ )
75
+ _log.info("Auto OCR model selected rapidocr with onnxruntime.")
76
+ except ImportError:
77
+ _log.info(
78
+ "rapidocr cannot be used because onnxruntime is not installed."
79
+ )
80
+
81
+ if self._engine is None:
82
+ try:
83
+ import easyocr
84
+
85
+ self._engine = EasyOcrModel(
86
+ enabled=self.enabled,
87
+ artifacts_path=artifacts_path,
88
+ options=EasyOcrOptions(
89
+ bitmap_area_threshold=self.options.bitmap_area_threshold,
90
+ force_full_page_ocr=self.options.force_full_page_ocr,
91
+ ),
92
+ accelerator_options=accelerator_options,
93
+ )
94
+ _log.info("Auto OCR model selected easyocr.")
95
+ except ImportError:
96
+ _log.info("easyocr cannot be used because it is not installed.")
97
+
98
+ if self._engine is None:
99
+ try:
100
+ import torch
101
+ from rapidocr import EngineType, RapidOCR # type: ignore
102
+
103
+ self._engine = RapidOcrModel(
104
+ enabled=self.enabled,
105
+ artifacts_path=artifacts_path,
106
+ options=RapidOcrOptions(
107
+ backend="torch",
108
+ bitmap_area_threshold=self.options.bitmap_area_threshold,
109
+ force_full_page_ocr=self.options.force_full_page_ocr,
110
+ ),
111
+ accelerator_options=accelerator_options,
112
+ )
113
+ _log.info("Auto OCR model selected rapidocr with torch.")
114
+ except ImportError:
115
+ _log.info(
116
+ "rapidocr cannot be used because rapidocr or torch is not installed."
117
+ )
118
+
119
+ if self._engine is None:
120
+ _log.warning("No OCR engine found. Please review the install details.")
121
+
122
+ def __call__(
123
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
124
+ ) -> Iterable[Page]:
125
+ if not self.enabled or self._engine is None:
126
+ yield from page_batch
127
+ return
128
+ yield from self._engine(conv_res, page_batch)
129
+
130
+ @classmethod
131
+ def get_options_type(cls) -> Type[OcrOptions]:
132
+ return OcrAutoOptions
@@ -173,11 +173,11 @@ class BaseItemAndImageEnrichmentModel(
173
173
  assert isinstance(element, DocItem)
174
174
 
175
175
  # Allow the case of documents without page images but embedded images (e.g. Word and HTML docs)
176
- if len(element.prov) == 0 and isinstance(element, PictureItem):
176
+ if isinstance(element, PictureItem):
177
177
  embedded_im = element.get_image(conv_res.document)
178
178
  if embedded_im is not None:
179
179
  return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
180
- else:
180
+ elif len(element.prov) == 0:
181
181
  return None
182
182
 
183
183
  # Crop the image form the page
@@ -1,4 +1,5 @@
1
1
  def ocr_engines():
2
+ from docling.models.auto_ocr_model import OcrAutoModel
2
3
  from docling.models.easyocr_model import EasyOcrModel
3
4
  from docling.models.ocr_mac_model import OcrMacModel
4
5
  from docling.models.rapid_ocr_model import RapidOcrModel
@@ -7,6 +8,7 @@ def ocr_engines():
7
8
 
8
9
  return {
9
10
  "ocr_engines": [
11
+ OcrAutoModel,
10
12
  EasyOcrModel,
11
13
  OcrMacModel,
12
14
  RapidOcrModel,