docling 2.55.1__tar.gz → 2.56.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- {docling-2.55.1 → docling-2.56.1}/PKG-INFO +5 -3
- {docling-2.55.1 → docling-2.56.1}/docling/backend/html_backend.py +36 -15
- {docling-2.55.1 → docling-2.56.1}/docling/backend/msexcel_backend.py +13 -9
- {docling-2.55.1 → docling-2.56.1}/docling/cli/main.py +33 -8
- {docling-2.55.1 → docling-2.56.1}/docling/cli/models.py +3 -1
- {docling-2.55.1 → docling-2.56.1}/docling/datamodel/pipeline_options.py +15 -1
- docling-2.56.1/docling/models/auto_ocr_model.py +132 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/base_model.py +2 -2
- {docling-2.55.1 → docling-2.56.1}/docling/models/plugins/defaults.py +2 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/rapid_ocr_model.py +126 -5
- {docling-2.55.1 → docling-2.56.1}/docling/models/tesseract_ocr_cli_model.py +4 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/tesseract_ocr_model.py +15 -5
- {docling-2.55.1 → docling-2.56.1}/docling/pipeline/asr_pipeline.py +53 -6
- {docling-2.55.1 → docling-2.56.1}/docling/utils/model_downloader.py +13 -1
- {docling-2.55.1 → docling-2.56.1}/docling.egg-info/PKG-INFO +5 -3
- {docling-2.55.1 → docling-2.56.1}/docling.egg-info/SOURCES.txt +1 -0
- {docling-2.55.1 → docling-2.56.1}/docling.egg-info/requires.txt +9 -2
- {docling-2.55.1 → docling-2.56.1}/pyproject.toml +6 -3
- {docling-2.55.1 → docling-2.56.1}/tests/test_asr_pipeline.py +26 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_backend_msexcel.py +17 -2
- {docling-2.55.1 → docling-2.56.1}/LICENSE +0 -0
- {docling-2.55.1 → docling-2.56.1}/README.md +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/__init__.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/__init__.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/abstract_backend.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/csv_backend.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/docling_parse_v4_backend.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/docx/__init__.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/json/__init__.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/md_backend.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/mets_gbs_backend.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/msword_backend.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/noop_backend.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/pdf_backend.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/webvtt_backend.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/xml/__init__.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/chunking/__init__.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/cli/__init__.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/cli/tools.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/datamodel/__init__.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/datamodel/asr_model_specs.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/datamodel/base_models.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/datamodel/document.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/datamodel/extraction.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/datamodel/layout_model_specs.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/datamodel/pipeline_options_asr_model.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/datamodel/settings.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/datamodel/vlm_model_specs.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/document_converter.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/document_extractor.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/exceptions.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/__init__.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/api_vlm_model.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/base_ocr_model.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/code_formula_model.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/easyocr_model.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/factories/__init__.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/factories/base_factory.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/layout_model.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/page_assemble_model.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/plugins/__init__.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/readingorder_model.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/table_structure_model.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/utils/__init__.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/utils/generation_utils.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/utils/hf_model_download.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/vlm_models_inline/mlx_model.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/vlm_models_inline/nuextract_transformers_model.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/models/vlm_models_inline/vllm_model.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/pipeline/__init__.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/pipeline/base_extraction_pipeline.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/pipeline/extraction_vlm_pipeline.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/pipeline/threaded_standard_pdf_pipeline.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/pipeline/vlm_pipeline.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/py.typed +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/utils/__init__.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/utils/api_image_request.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/utils/export.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/utils/glm_utils.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/utils/locks.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/utils/ocr_utils.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/utils/orientation.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/utils/profiling.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/utils/utils.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling/utils/visualization.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.55.1 → docling-2.56.1}/docling.egg-info/top_level.txt +0 -0
- {docling-2.55.1 → docling-2.56.1}/setup.cfg +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_backend_csv.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_backend_docling_json.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_backend_docling_parse_v4.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_backend_html.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_backend_jats.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_backend_markdown.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_backend_mets_gbs.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_backend_msword.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_backend_patent_uspto.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_backend_pdfium.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_backend_pptx.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_backend_vtt.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_backend_webp.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_cli.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_code_formula.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_data_gen_flag.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_document_picture_classifier.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_e2e_conversion.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_e2e_ocr_conversion.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_extraction.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_input_doc.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_interfaces.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_invalid_input.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_legacy_format_transform.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_ocr_utils.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_options.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_settings_load.py +0 -0
- {docling-2.55.1 → docling-2.56.1}/tests/test_threaded_pipeline.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.56.1
|
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -34,7 +34,8 @@ Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
|
|
|
34
34
|
Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
|
|
35
35
|
Requires-Dist: huggingface_hub<1,>=0.23
|
|
36
36
|
Requires-Dist: requests<3.0.0,>=2.32.2
|
|
37
|
-
Requires-Dist:
|
|
37
|
+
Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin"
|
|
38
|
+
Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14"
|
|
38
39
|
Requires-Dist: certifi>=2024.7.4
|
|
39
40
|
Requires-Dist: rtree<2.0.0,>=1.3.0
|
|
40
41
|
Requires-Dist: typer<0.20.0,>=0.12.5
|
|
@@ -52,6 +53,8 @@ Requires-Dist: pylatexenc<3.0,>=2.10
|
|
|
52
53
|
Requires-Dist: scipy<2.0.0,>=1.6.0
|
|
53
54
|
Requires-Dist: accelerate<2,>=1.0.0
|
|
54
55
|
Requires-Dist: polyfactory>=2.22.2
|
|
56
|
+
Provides-Extra: easyocr
|
|
57
|
+
Requires-Dist: easyocr<2.0,>=1.7; extra == "easyocr"
|
|
55
58
|
Provides-Extra: tesserocr
|
|
56
59
|
Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
|
|
57
60
|
Provides-Extra: ocrmac
|
|
@@ -65,7 +68,6 @@ Requires-Dist: qwen-vl-utils>=0.0.11; extra == "vlm"
|
|
|
65
68
|
Provides-Extra: rapidocr
|
|
66
69
|
Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14" and extra == "rapidocr"
|
|
67
70
|
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
|
|
68
|
-
Requires-Dist: modelscope>=1.29.0; extra == "rapidocr"
|
|
69
71
|
Provides-Extra: asr
|
|
70
72
|
Requires-Dist: openai-whisper>=20250625; extra == "asr"
|
|
71
73
|
Dynamic: license-file
|
|
@@ -272,9 +272,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
272
272
|
for br in content("br"):
|
|
273
273
|
br.replace_with(NavigableString("\n"))
|
|
274
274
|
# set default content layer
|
|
275
|
-
|
|
275
|
+
|
|
276
|
+
# Furniture before the first heading rule, except for headers in tables
|
|
277
|
+
header = None
|
|
278
|
+
# Find all headers first
|
|
279
|
+
all_headers = content.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
|
|
280
|
+
# Keep only those that do NOT have a <table> in a parent chain
|
|
281
|
+
clean_headers = [h for h in all_headers if not h.find_parent("table")]
|
|
282
|
+
# Pick the first header from the remaining
|
|
283
|
+
if len(clean_headers):
|
|
284
|
+
header = clean_headers[0]
|
|
285
|
+
# Set starting content layer
|
|
276
286
|
self.content_layer = (
|
|
277
|
-
ContentLayer.BODY if
|
|
287
|
+
ContentLayer.BODY if header is None else ContentLayer.FURNITURE
|
|
278
288
|
)
|
|
279
289
|
# reset context
|
|
280
290
|
self.ctx = _Context()
|
|
@@ -309,9 +319,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
309
319
|
group_name: str,
|
|
310
320
|
doc: DoclingDocument,
|
|
311
321
|
docling_table: TableItem,
|
|
312
|
-
) -> tuple[bool, RefItem]:
|
|
322
|
+
) -> tuple[bool, Union[RefItem, None]]:
|
|
313
323
|
rich_table_cell = False
|
|
314
|
-
ref_for_rich_cell =
|
|
324
|
+
ref_for_rich_cell = None
|
|
325
|
+
if len(provs_in_cell) > 0:
|
|
326
|
+
ref_for_rich_cell = provs_in_cell[0]
|
|
315
327
|
if len(provs_in_cell) > 1:
|
|
316
328
|
# Cell has multiple elements, we need to group them
|
|
317
329
|
rich_table_cell = True
|
|
@@ -324,7 +336,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
324
336
|
if isinstance(pr_item, TextItem):
|
|
325
337
|
# Cell has only one element and it's just a text
|
|
326
338
|
rich_table_cell = False
|
|
327
|
-
|
|
339
|
+
try:
|
|
340
|
+
doc.delete_items(node_items=[pr_item])
|
|
341
|
+
except Exception as e:
|
|
342
|
+
_log.error(f"Error while making rich table: {e}.")
|
|
328
343
|
else:
|
|
329
344
|
rich_table_cell = True
|
|
330
345
|
ref_for_rich_cell = HTMLDocumentBackend.group_cell_elements(
|
|
@@ -391,17 +406,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
391
406
|
|
|
392
407
|
provs_in_cell: list[RefItem] = []
|
|
393
408
|
# Parse table cell sub-tree for Rich Cells content:
|
|
409
|
+
table_level = self.level
|
|
394
410
|
provs_in_cell = self._walk(html_cell, doc)
|
|
411
|
+
# After walking sub-tree in cell, restore previously set level
|
|
412
|
+
self.level = table_level
|
|
395
413
|
|
|
396
414
|
rich_table_cell = False
|
|
397
415
|
ref_for_rich_cell = None
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
provs_in_cell, group_name, doc, docling_table
|
|
403
|
-
)
|
|
416
|
+
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
|
|
417
|
+
rich_table_cell, ref_for_rich_cell = (
|
|
418
|
+
HTMLDocumentBackend.process_rich_table_cells(
|
|
419
|
+
provs_in_cell, group_name, doc, docling_table
|
|
404
420
|
)
|
|
421
|
+
)
|
|
405
422
|
|
|
406
423
|
# Extracting text
|
|
407
424
|
text = self.get_text(html_cell).strip()
|
|
@@ -774,13 +791,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
774
791
|
for key in self.parents.keys():
|
|
775
792
|
self.parents[key] = None
|
|
776
793
|
self.level = 0
|
|
777
|
-
|
|
794
|
+
self.parents[self.level + 1] = doc.add_title(
|
|
778
795
|
text_clean,
|
|
779
796
|
content_layer=self.content_layer,
|
|
780
797
|
formatting=annotated_text.formatting,
|
|
781
798
|
hyperlink=annotated_text.hyperlink,
|
|
782
799
|
)
|
|
783
|
-
|
|
800
|
+
p1 = self.parents[self.level + 1]
|
|
801
|
+
if p1 is not None:
|
|
802
|
+
added_ref = [p1.get_ref()]
|
|
784
803
|
# the other levels need to be lowered by 1 if a title was set
|
|
785
804
|
else:
|
|
786
805
|
level -= 1
|
|
@@ -802,7 +821,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
802
821
|
_log.debug(f"Remove the tail of level {key}")
|
|
803
822
|
self.parents[key] = None
|
|
804
823
|
self.level = level
|
|
805
|
-
|
|
824
|
+
self.parents[self.level + 1] = doc.add_heading(
|
|
806
825
|
parent=self.parents[self.level],
|
|
807
826
|
text=text_clean,
|
|
808
827
|
orig=annotated_text.text,
|
|
@@ -811,7 +830,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
811
830
|
formatting=annotated_text.formatting,
|
|
812
831
|
hyperlink=annotated_text.hyperlink,
|
|
813
832
|
)
|
|
814
|
-
|
|
833
|
+
p2 = self.parents[self.level + 1]
|
|
834
|
+
if p2 is not None:
|
|
835
|
+
added_ref = [p2.get_ref()]
|
|
815
836
|
self.level += 1
|
|
816
837
|
for img_tag in tag("img"):
|
|
817
838
|
if isinstance(img_tag, Tag):
|
|
@@ -18,6 +18,7 @@ from docling_core.types.doc import (
|
|
|
18
18
|
TableData,
|
|
19
19
|
)
|
|
20
20
|
from openpyxl import load_workbook
|
|
21
|
+
from openpyxl.chartsheet.chartsheet import Chartsheet
|
|
21
22
|
from openpyxl.drawing.image import Image
|
|
22
23
|
from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
|
|
23
24
|
from openpyxl.worksheet.worksheet import Worksheet
|
|
@@ -186,18 +187,18 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
186
187
|
|
|
187
188
|
if self.workbook is not None:
|
|
188
189
|
# Iterate over all sheets
|
|
189
|
-
for
|
|
190
|
-
_log.info(f"Processing sheet: {
|
|
190
|
+
for idx, name in enumerate(self.workbook.sheetnames):
|
|
191
|
+
_log.info(f"Processing sheet {idx}: {name}")
|
|
191
192
|
|
|
192
|
-
sheet = self.workbook[
|
|
193
|
-
page_no =
|
|
193
|
+
sheet = self.workbook[name]
|
|
194
|
+
page_no = idx + 1
|
|
194
195
|
# do not rely on sheet.max_column, sheet.max_row if there are images
|
|
195
196
|
page = doc.add_page(page_no=page_no, size=Size(width=0, height=0))
|
|
196
197
|
|
|
197
198
|
self.parents[0] = doc.add_group(
|
|
198
199
|
parent=None,
|
|
199
200
|
label=GroupLabel.SECTION,
|
|
200
|
-
name=f"sheet: {
|
|
201
|
+
name=f"sheet: {name}",
|
|
201
202
|
content_layer=self._get_sheet_content_layer(sheet),
|
|
202
203
|
)
|
|
203
204
|
doc = self._convert_sheet(doc, sheet)
|
|
@@ -208,7 +209,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
208
209
|
|
|
209
210
|
return doc
|
|
210
211
|
|
|
211
|
-
def _convert_sheet(
|
|
212
|
+
def _convert_sheet(
|
|
213
|
+
self, doc: DoclingDocument, sheet: Union[Worksheet, Chartsheet]
|
|
214
|
+
) -> DoclingDocument:
|
|
212
215
|
"""Parse an Excel worksheet and attach its structure to a DoclingDocument
|
|
213
216
|
|
|
214
217
|
Args:
|
|
@@ -218,10 +221,11 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
218
221
|
Returns:
|
|
219
222
|
The updated DoclingDocument.
|
|
220
223
|
"""
|
|
224
|
+
if isinstance(sheet, Worksheet):
|
|
225
|
+
doc = self._find_tables_in_sheet(doc, sheet)
|
|
226
|
+
doc = self._find_images_in_sheet(doc, sheet)
|
|
221
227
|
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
doc = self._find_images_in_sheet(doc, sheet)
|
|
228
|
+
# TODO: parse charts in sheet
|
|
225
229
|
|
|
226
230
|
return doc
|
|
227
231
|
|
|
@@ -49,7 +49,7 @@ from docling.datamodel.document import ConversionResult
|
|
|
49
49
|
from docling.datamodel.pipeline_options import (
|
|
50
50
|
AsrPipelineOptions,
|
|
51
51
|
ConvertPipelineOptions,
|
|
52
|
-
|
|
52
|
+
OcrAutoOptions,
|
|
53
53
|
OcrOptions,
|
|
54
54
|
PaginatedPipelineOptions,
|
|
55
55
|
PdfBackend,
|
|
@@ -57,6 +57,8 @@ from docling.datamodel.pipeline_options import (
|
|
|
57
57
|
PipelineOptions,
|
|
58
58
|
ProcessingPipeline,
|
|
59
59
|
TableFormerMode,
|
|
60
|
+
TesseractCliOcrOptions,
|
|
61
|
+
TesseractOcrOptions,
|
|
60
62
|
VlmPipelineOptions,
|
|
61
63
|
)
|
|
62
64
|
from docling.datamodel.settings import settings
|
|
@@ -372,7 +374,7 @@ def convert( # noqa: C901
|
|
|
372
374
|
f"Use the option --show-external-plugins to see the options allowed with external plugins."
|
|
373
375
|
),
|
|
374
376
|
),
|
|
375
|
-
] =
|
|
377
|
+
] = OcrAutoOptions.kind,
|
|
376
378
|
ocr_lang: Annotated[
|
|
377
379
|
Optional[str],
|
|
378
380
|
typer.Option(
|
|
@@ -380,6 +382,13 @@ def convert( # noqa: C901
|
|
|
380
382
|
help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
|
|
381
383
|
),
|
|
382
384
|
] = None,
|
|
385
|
+
psm: Annotated[
|
|
386
|
+
Optional[int],
|
|
387
|
+
typer.Option(
|
|
388
|
+
...,
|
|
389
|
+
help="Page Segmentation Mode for the OCR engine (0-13).",
|
|
390
|
+
),
|
|
391
|
+
] = None,
|
|
383
392
|
pdf_backend: Annotated[
|
|
384
393
|
PdfBackend, typer.Option(..., help="The PDF backend to use.")
|
|
385
394
|
] = PdfBackend.DLPARSE_V2,
|
|
@@ -547,13 +556,25 @@ def convert( # noqa: C901
|
|
|
547
556
|
if local_path.exists() and local_path.is_dir():
|
|
548
557
|
for fmt in from_formats:
|
|
549
558
|
for ext in FormatToExtensions[fmt]:
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
559
|
+
for path in local_path.glob(f"**/*.{ext}"):
|
|
560
|
+
if path.name.startswith("~$") and ext == "docx":
|
|
561
|
+
_log.info(
|
|
562
|
+
f"Ignoring temporary Word file: {path}"
|
|
563
|
+
)
|
|
564
|
+
continue
|
|
565
|
+
input_doc_paths.append(path)
|
|
566
|
+
|
|
567
|
+
for path in local_path.glob(f"**/*.{ext.upper()}"):
|
|
568
|
+
if path.name.startswith("~$") and ext == "docx":
|
|
569
|
+
_log.info(
|
|
570
|
+
f"Ignoring temporary Word file: {path}"
|
|
571
|
+
)
|
|
572
|
+
continue
|
|
573
|
+
input_doc_paths.append(path)
|
|
556
574
|
elif local_path.exists():
|
|
575
|
+
if not local_path.name.startswith("~$") and ext == "docx":
|
|
576
|
+
_log.info(f"Ignoring temporary Word file: {path}")
|
|
577
|
+
continue
|
|
557
578
|
input_doc_paths.append(local_path)
|
|
558
579
|
else:
|
|
559
580
|
err_console.print(
|
|
@@ -584,6 +605,10 @@ def convert( # noqa: C901
|
|
|
584
605
|
ocr_lang_list = _split_list(ocr_lang)
|
|
585
606
|
if ocr_lang_list is not None:
|
|
586
607
|
ocr_options.lang = ocr_lang_list
|
|
608
|
+
if psm is not None and isinstance(
|
|
609
|
+
ocr_options, (TesseractOcrOptions, TesseractCliOcrOptions)
|
|
610
|
+
):
|
|
611
|
+
ocr_options.psm = psm
|
|
587
612
|
|
|
588
613
|
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
|
589
614
|
# pipeline_options: PaginatedPipelineOptions
|
|
@@ -38,6 +38,7 @@ class _AvailableModels(str, Enum):
|
|
|
38
38
|
SMOLDOCLING = "smoldocling"
|
|
39
39
|
SMOLDOCLING_MLX = "smoldocling_mlx"
|
|
40
40
|
GRANITE_VISION = "granite_vision"
|
|
41
|
+
RAPIDOCR = "rapidocr"
|
|
41
42
|
EASYOCR = "easyocr"
|
|
42
43
|
|
|
43
44
|
|
|
@@ -46,7 +47,7 @@ _default_models = [
|
|
|
46
47
|
_AvailableModels.TABLEFORMER,
|
|
47
48
|
_AvailableModels.CODE_FORMULA,
|
|
48
49
|
_AvailableModels.PICTURE_CLASSIFIER,
|
|
49
|
-
_AvailableModels.
|
|
50
|
+
_AvailableModels.RAPIDOCR,
|
|
50
51
|
]
|
|
51
52
|
|
|
52
53
|
|
|
@@ -115,6 +116,7 @@ def download(
|
|
|
115
116
|
with_smoldocling=_AvailableModels.SMOLDOCLING in to_download,
|
|
116
117
|
with_smoldocling_mlx=_AvailableModels.SMOLDOCLING_MLX in to_download,
|
|
117
118
|
with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
|
|
119
|
+
with_rapidocr=_AvailableModels.RAPIDOCR in to_download,
|
|
118
120
|
with_easyocr=_AvailableModels.EASYOCR in to_download,
|
|
119
121
|
)
|
|
120
122
|
|
|
@@ -81,6 +81,13 @@ class OcrOptions(BaseOptions):
|
|
|
81
81
|
)
|
|
82
82
|
|
|
83
83
|
|
|
84
|
+
class OcrAutoOptions(OcrOptions):
|
|
85
|
+
"""Options for pick OCR engine automatically."""
|
|
86
|
+
|
|
87
|
+
kind: ClassVar[Literal["auto"]] = "auto"
|
|
88
|
+
lang: List[str] = []
|
|
89
|
+
|
|
90
|
+
|
|
84
91
|
class RapidOcrOptions(OcrOptions):
|
|
85
92
|
"""Options for the RapidOCR engine."""
|
|
86
93
|
|
|
@@ -154,6 +161,9 @@ class TesseractCliOcrOptions(OcrOptions):
|
|
|
154
161
|
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
|
155
162
|
tesseract_cmd: str = "tesseract"
|
|
156
163
|
path: Optional[str] = None
|
|
164
|
+
psm: Optional[int] = (
|
|
165
|
+
None # Page Segmentation Mode (0-13), defaults to tesseract's default
|
|
166
|
+
)
|
|
157
167
|
|
|
158
168
|
model_config = ConfigDict(
|
|
159
169
|
extra="forbid",
|
|
@@ -166,6 +176,9 @@ class TesseractOcrOptions(OcrOptions):
|
|
|
166
176
|
kind: ClassVar[Literal["tesserocr"]] = "tesserocr"
|
|
167
177
|
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
|
168
178
|
path: Optional[str] = None
|
|
179
|
+
psm: Optional[int] = (
|
|
180
|
+
None # Page Segmentation Mode (0-13), defaults to tesseract's default
|
|
181
|
+
)
|
|
169
182
|
|
|
170
183
|
model_config = ConfigDict(
|
|
171
184
|
extra="forbid",
|
|
@@ -249,6 +262,7 @@ class PdfBackend(str, Enum):
|
|
|
249
262
|
class OcrEngine(str, Enum):
|
|
250
263
|
"""Enum of valid OCR engines."""
|
|
251
264
|
|
|
265
|
+
AUTO = "auto"
|
|
252
266
|
EASYOCR = "easyocr"
|
|
253
267
|
TESSERACT_CLI = "tesseract_cli"
|
|
254
268
|
TESSERACT = "tesseract"
|
|
@@ -330,7 +344,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|
|
330
344
|
# If True, text from backend will be used instead of generated text
|
|
331
345
|
|
|
332
346
|
table_structure_options: TableStructureOptions = TableStructureOptions()
|
|
333
|
-
ocr_options: OcrOptions =
|
|
347
|
+
ocr_options: OcrOptions = OcrAutoOptions()
|
|
334
348
|
layout_options: LayoutOptions = LayoutOptions()
|
|
335
349
|
|
|
336
350
|
images_scale: float = 1.0
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import sys
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional, Type
|
|
6
|
+
|
|
7
|
+
from docling.datamodel.accelerator_options import AcceleratorOptions
|
|
8
|
+
from docling.datamodel.base_models import Page
|
|
9
|
+
from docling.datamodel.document import ConversionResult
|
|
10
|
+
from docling.datamodel.pipeline_options import (
|
|
11
|
+
EasyOcrOptions,
|
|
12
|
+
OcrAutoOptions,
|
|
13
|
+
OcrMacOptions,
|
|
14
|
+
OcrOptions,
|
|
15
|
+
RapidOcrOptions,
|
|
16
|
+
)
|
|
17
|
+
from docling.models.base_ocr_model import BaseOcrModel
|
|
18
|
+
from docling.models.easyocr_model import EasyOcrModel
|
|
19
|
+
from docling.models.ocr_mac_model import OcrMacModel
|
|
20
|
+
from docling.models.rapid_ocr_model import RapidOcrModel
|
|
21
|
+
|
|
22
|
+
_log = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class OcrAutoModel(BaseOcrModel):
|
|
26
|
+
def __init__(
|
|
27
|
+
self,
|
|
28
|
+
enabled: bool,
|
|
29
|
+
artifacts_path: Optional[Path],
|
|
30
|
+
options: OcrAutoOptions,
|
|
31
|
+
accelerator_options: AcceleratorOptions,
|
|
32
|
+
):
|
|
33
|
+
super().__init__(
|
|
34
|
+
enabled=enabled,
|
|
35
|
+
artifacts_path=artifacts_path,
|
|
36
|
+
options=options,
|
|
37
|
+
accelerator_options=accelerator_options,
|
|
38
|
+
)
|
|
39
|
+
self.options: OcrAutoOptions
|
|
40
|
+
|
|
41
|
+
self._engine: Optional[BaseOcrModel] = None
|
|
42
|
+
if self.enabled:
|
|
43
|
+
if "darwin" == sys.platform:
|
|
44
|
+
try:
|
|
45
|
+
from ocrmac import ocrmac
|
|
46
|
+
|
|
47
|
+
self._engine = OcrMacModel(
|
|
48
|
+
enabled=self.enabled,
|
|
49
|
+
artifacts_path=artifacts_path,
|
|
50
|
+
options=OcrMacOptions(
|
|
51
|
+
bitmap_area_threshold=self.options.bitmap_area_threshold,
|
|
52
|
+
force_full_page_ocr=self.options.force_full_page_ocr,
|
|
53
|
+
),
|
|
54
|
+
accelerator_options=accelerator_options,
|
|
55
|
+
)
|
|
56
|
+
_log.info("Auto OCR model selected ocrmac.")
|
|
57
|
+
except ImportError:
|
|
58
|
+
_log.info("ocrmac cannot be used because ocrmac is not installed.")
|
|
59
|
+
|
|
60
|
+
if self._engine is None:
|
|
61
|
+
try:
|
|
62
|
+
import onnxruntime
|
|
63
|
+
from rapidocr import EngineType, RapidOCR # type: ignore
|
|
64
|
+
|
|
65
|
+
self._engine = RapidOcrModel(
|
|
66
|
+
enabled=self.enabled,
|
|
67
|
+
artifacts_path=artifacts_path,
|
|
68
|
+
options=RapidOcrOptions(
|
|
69
|
+
backend="onnxruntime",
|
|
70
|
+
bitmap_area_threshold=self.options.bitmap_area_threshold,
|
|
71
|
+
force_full_page_ocr=self.options.force_full_page_ocr,
|
|
72
|
+
),
|
|
73
|
+
accelerator_options=accelerator_options,
|
|
74
|
+
)
|
|
75
|
+
_log.info("Auto OCR model selected rapidocr with onnxruntime.")
|
|
76
|
+
except ImportError:
|
|
77
|
+
_log.info(
|
|
78
|
+
"rapidocr cannot be used because onnxruntime is not installed."
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
if self._engine is None:
|
|
82
|
+
try:
|
|
83
|
+
import easyocr
|
|
84
|
+
|
|
85
|
+
self._engine = EasyOcrModel(
|
|
86
|
+
enabled=self.enabled,
|
|
87
|
+
artifacts_path=artifacts_path,
|
|
88
|
+
options=EasyOcrOptions(
|
|
89
|
+
bitmap_area_threshold=self.options.bitmap_area_threshold,
|
|
90
|
+
force_full_page_ocr=self.options.force_full_page_ocr,
|
|
91
|
+
),
|
|
92
|
+
accelerator_options=accelerator_options,
|
|
93
|
+
)
|
|
94
|
+
_log.info("Auto OCR model selected easyocr.")
|
|
95
|
+
except ImportError:
|
|
96
|
+
_log.info("easyocr cannot be used because it is not installed.")
|
|
97
|
+
|
|
98
|
+
if self._engine is None:
|
|
99
|
+
try:
|
|
100
|
+
import torch
|
|
101
|
+
from rapidocr import EngineType, RapidOCR # type: ignore
|
|
102
|
+
|
|
103
|
+
self._engine = RapidOcrModel(
|
|
104
|
+
enabled=self.enabled,
|
|
105
|
+
artifacts_path=artifacts_path,
|
|
106
|
+
options=RapidOcrOptions(
|
|
107
|
+
backend="torch",
|
|
108
|
+
bitmap_area_threshold=self.options.bitmap_area_threshold,
|
|
109
|
+
force_full_page_ocr=self.options.force_full_page_ocr,
|
|
110
|
+
),
|
|
111
|
+
accelerator_options=accelerator_options,
|
|
112
|
+
)
|
|
113
|
+
_log.info("Auto OCR model selected rapidocr with torch.")
|
|
114
|
+
except ImportError:
|
|
115
|
+
_log.info(
|
|
116
|
+
"rapidocr cannot be used because rapidocr or torch is not installed."
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
if self._engine is None:
|
|
120
|
+
_log.warning("No OCR engine found. Please review the install details.")
|
|
121
|
+
|
|
122
|
+
def __call__(
|
|
123
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
|
124
|
+
) -> Iterable[Page]:
|
|
125
|
+
if not self.enabled or self._engine is None:
|
|
126
|
+
yield from page_batch
|
|
127
|
+
return
|
|
128
|
+
yield from self._engine(conv_res, page_batch)
|
|
129
|
+
|
|
130
|
+
@classmethod
|
|
131
|
+
def get_options_type(cls) -> Type[OcrOptions]:
|
|
132
|
+
return OcrAutoOptions
|
|
@@ -173,11 +173,11 @@ class BaseItemAndImageEnrichmentModel(
|
|
|
173
173
|
assert isinstance(element, DocItem)
|
|
174
174
|
|
|
175
175
|
# Allow the case of documents without page images but embedded images (e.g. Word and HTML docs)
|
|
176
|
-
if
|
|
176
|
+
if isinstance(element, PictureItem):
|
|
177
177
|
embedded_im = element.get_image(conv_res.document)
|
|
178
178
|
if embedded_im is not None:
|
|
179
179
|
return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
|
|
180
|
-
|
|
180
|
+
elif len(element.prov) == 0:
|
|
181
181
|
return None
|
|
182
182
|
|
|
183
183
|
# Crop the image form the page
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
def ocr_engines():
|
|
2
|
+
from docling.models.auto_ocr_model import OcrAutoModel
|
|
2
3
|
from docling.models.easyocr_model import EasyOcrModel
|
|
3
4
|
from docling.models.ocr_mac_model import OcrMacModel
|
|
4
5
|
from docling.models.rapid_ocr_model import RapidOcrModel
|
|
@@ -7,6 +8,7 @@ def ocr_engines():
|
|
|
7
8
|
|
|
8
9
|
return {
|
|
9
10
|
"ocr_engines": [
|
|
11
|
+
OcrAutoModel,
|
|
10
12
|
EasyOcrModel,
|
|
11
13
|
OcrMacModel,
|
|
12
14
|
RapidOcrModel,
|