docling 2.39.0__tar.gz → 2.66.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.39.0 → docling-2.66.0}/PKG-INFO +38 -19
- {docling-2.39.0 → docling-2.66.0}/README.md +17 -7
- {docling-2.39.0 → docling-2.66.0}/docling/backend/abstract_backend.py +24 -3
- {docling-2.39.0 → docling-2.66.0}/docling/backend/asciidoc_backend.py +4 -4
- {docling-2.39.0 → docling-2.66.0}/docling/backend/docling_parse_v4_backend.py +103 -36
- docling-2.66.0/docling/backend/docx/drawingml/utils.py +131 -0
- {docling-2.39.0 → docling-2.66.0}/docling/backend/docx/latex/latex_dict.py +5 -0
- {docling-2.39.0 → docling-2.66.0}/docling/backend/docx/latex/omml.py +11 -2
- docling-2.66.0/docling/backend/html_backend.py +1502 -0
- docling-2.66.0/docling/backend/image_backend.py +188 -0
- {docling-2.39.0 → docling-2.66.0}/docling/backend/md_backend.py +82 -18
- docling-2.66.0/docling/backend/mets_gbs_backend.py +399 -0
- {docling-2.39.0 → docling-2.66.0}/docling/backend/msexcel_backend.py +252 -85
- {docling-2.39.0 → docling-2.66.0}/docling/backend/mspowerpoint_backend.py +2 -2
- {docling-2.39.0 → docling-2.66.0}/docling/backend/msword_backend.py +640 -151
- {docling-2.39.0 → docling-2.66.0}/docling/backend/pdf_backend.py +14 -14
- {docling-2.39.0 → docling-2.66.0}/docling/backend/pypdfium2_backend.py +39 -8
- docling-2.66.0/docling/backend/webvtt_backend.py +572 -0
- {docling-2.39.0 → docling-2.66.0}/docling/backend/xml/jats_backend.py +123 -11
- {docling-2.39.0 → docling-2.66.0}/docling/backend/xml/uspto_backend.py +1 -1
- {docling-2.39.0 → docling-2.66.0}/docling/cli/main.py +257 -64
- {docling-2.39.0 → docling-2.66.0}/docling/cli/models.py +63 -1
- docling-2.66.0/docling/datamodel/asr_model_specs.py +494 -0
- docling-2.66.0/docling/datamodel/backend_options.py +102 -0
- {docling-2.39.0 → docling-2.66.0}/docling/datamodel/base_models.py +103 -26
- {docling-2.39.0 → docling-2.66.0}/docling/datamodel/document.py +292 -52
- docling-2.66.0/docling/datamodel/extraction.py +39 -0
- docling-2.66.0/docling/datamodel/layout_model_specs.py +90 -0
- {docling-2.39.0 → docling-2.66.0}/docling/datamodel/pipeline_options.py +172 -32
- {docling-2.39.0 → docling-2.66.0}/docling/datamodel/pipeline_options_asr_model.py +21 -1
- {docling-2.39.0 → docling-2.66.0}/docling/datamodel/pipeline_options_vlm_model.py +56 -3
- {docling-2.39.0 → docling-2.66.0}/docling/datamodel/settings.py +7 -12
- docling-2.66.0/docling/datamodel/vlm_model_specs.py +330 -0
- docling-2.66.0/docling/document_converter.py +559 -0
- docling-2.66.0/docling/document_extractor.py +327 -0
- docling-2.66.0/docling/experimental/__init__.py +5 -0
- docling-2.66.0/docling/experimental/datamodel/__init__.py +1 -0
- docling-2.66.0/docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling-2.66.0/docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling-2.66.0/docling/experimental/models/__init__.py +3 -0
- docling-2.66.0/docling/experimental/models/table_crops_layout_model.py +114 -0
- docling-2.66.0/docling/experimental/pipeline/__init__.py +1 -0
- docling-2.66.0/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling-2.66.0/docling/models/api_vlm_model.py +180 -0
- docling-2.66.0/docling/models/auto_ocr_model.py +132 -0
- docling-2.66.0/docling/models/base_layout_model.py +39 -0
- docling-2.66.0/docling/models/base_model.py +230 -0
- {docling-2.39.0 → docling-2.66.0}/docling/models/base_ocr_model.py +20 -2
- docling-2.66.0/docling/models/base_table_model.py +45 -0
- {docling-2.39.0 → docling-2.66.0}/docling/models/code_formula_model.py +87 -76
- {docling-2.39.0 → docling-2.66.0}/docling/models/document_picture_classifier.py +14 -15
- {docling-2.39.0 → docling-2.66.0}/docling/models/easyocr_model.py +19 -9
- {docling-2.39.0 → docling-2.66.0}/docling/models/factories/__init__.py +20 -0
- docling-2.66.0/docling/models/factories/layout_factory.py +7 -0
- docling-2.66.0/docling/models/factories/table_factory.py +7 -0
- docling-2.66.0/docling/models/layout_model.py +249 -0
- {docling-2.39.0 → docling-2.66.0}/docling/models/page_preprocessing_model.py +6 -2
- {docling-2.39.0 → docling-2.66.0}/docling/models/picture_description_api_model.py +3 -1
- {docling-2.39.0 → docling-2.66.0}/docling/models/picture_description_vlm_model.py +23 -11
- docling-2.66.0/docling/models/plugins/defaults.py +54 -0
- docling-2.66.0/docling/models/rapid_ocr_model.py +328 -0
- {docling-2.39.0 → docling-2.66.0}/docling/models/readingorder_model.py +71 -14
- docling-2.66.0/docling/models/table_structure_model.py +305 -0
- {docling-2.39.0 → docling-2.66.0}/docling/models/tesseract_ocr_cli_model.py +8 -2
- {docling-2.39.0 → docling-2.66.0}/docling/models/tesseract_ocr_model.py +23 -9
- docling-2.66.0/docling/models/utils/generation_utils.py +157 -0
- {docling-2.39.0 → docling-2.66.0}/docling/models/utils/hf_model_download.py +6 -1
- docling-2.66.0/docling/models/vlm_models_inline/hf_transformers_model.py +391 -0
- docling-2.66.0/docling/models/vlm_models_inline/mlx_model.py +330 -0
- docling-2.66.0/docling/models/vlm_models_inline/nuextract_transformers_model.py +305 -0
- docling-2.66.0/docling/models/vlm_models_inline/vllm_model.py +344 -0
- {docling-2.39.0 → docling-2.66.0}/docling/pipeline/asr_pipeline.py +203 -25
- docling-2.66.0/docling/pipeline/base_extraction_pipeline.py +72 -0
- {docling-2.39.0 → docling-2.66.0}/docling/pipeline/base_pipeline.py +107 -18
- docling-2.66.0/docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling-2.39.0/docling/pipeline/standard_pdf_pipeline.py → docling-2.66.0/docling/pipeline/legacy_standard_pdf_pipeline.py +31 -69
- {docling-2.39.0 → docling-2.66.0}/docling/pipeline/simple_pipeline.py +6 -6
- docling-2.66.0/docling/pipeline/standard_pdf_pipeline.py +843 -0
- docling-2.66.0/docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- {docling-2.39.0 → docling-2.66.0}/docling/pipeline/vlm_pipeline.py +72 -47
- docling-2.66.0/docling/py.typed +1 -0
- {docling-2.39.0 → docling-2.66.0}/docling/utils/accelerator_utils.py +2 -2
- docling-2.66.0/docling/utils/api_image_request.py +205 -0
- {docling-2.39.0 → docling-2.66.0}/docling/utils/layout_postprocessor.py +79 -61
- {docling-2.39.0 → docling-2.66.0}/docling/utils/model_downloader.py +38 -2
- {docling-2.39.0 → docling-2.66.0}/docling/utils/ocr_utils.py +1 -1
- {docling-2.39.0 → docling-2.66.0}/docling/utils/orientation.py +22 -28
- {docling-2.39.0 → docling-2.66.0}/docling.egg-info/PKG-INFO +38 -19
- {docling-2.39.0 → docling-2.66.0}/docling.egg-info/SOURCES.txt +38 -1
- docling-2.66.0/docling.egg-info/requires.txt +66 -0
- {docling-2.39.0 → docling-2.66.0}/pyproject.toml +29 -17
- docling-2.66.0/tests/test_asr_mlx_whisper.py +340 -0
- docling-2.66.0/tests/test_asr_pipeline.py +404 -0
- {docling-2.39.0 → docling-2.66.0}/tests/test_backend_docling_parse_v4.py +17 -0
- docling-2.66.0/tests/test_backend_html.py +561 -0
- docling-2.66.0/tests/test_backend_image_native.py +218 -0
- {docling-2.39.0 → docling-2.66.0}/tests/test_backend_jats.py +14 -14
- docling-2.66.0/tests/test_backend_markdown.py +111 -0
- docling-2.66.0/tests/test_backend_mets_gbs.py +77 -0
- docling-2.66.0/tests/test_backend_msexcel.py +314 -0
- docling-2.66.0/tests/test_backend_msword.py +239 -0
- {docling-2.39.0 → docling-2.66.0}/tests/test_backend_patent_uspto.py +11 -3
- {docling-2.39.0 → docling-2.66.0}/tests/test_backend_pdfium.py +19 -0
- docling-2.66.0/tests/test_backend_vtt.py +232 -0
- {docling-2.39.0 → docling-2.66.0}/tests/test_backend_webp.py +7 -3
- docling-2.66.0/tests/test_cli.py +92 -0
- docling-2.66.0/tests/test_conversion_result_json.py +44 -0
- {docling-2.39.0 → docling-2.66.0}/tests/test_document_picture_classifier.py +2 -1
- {docling-2.39.0 → docling-2.66.0}/tests/test_e2e_conversion.py +11 -8
- {docling-2.39.0 → docling-2.66.0}/tests/test_e2e_ocr_conversion.py +28 -15
- docling-2.66.0/tests/test_extraction.py +108 -0
- {docling-2.39.0 → docling-2.66.0}/tests/test_input_doc.py +71 -34
- docling-2.66.0/tests/test_interfaces.py +138 -0
- {docling-2.39.0 → docling-2.66.0}/tests/test_legacy_format_transform.py +1 -0
- docling-2.66.0/tests/test_ocr_utils.py +80 -0
- {docling-2.39.0 → docling-2.66.0}/tests/test_options.py +28 -0
- docling-2.66.0/tests/test_pdf_password.py +63 -0
- {docling-2.39.0 → docling-2.66.0}/tests/test_settings_load.py +1 -1
- docling-2.66.0/tests/test_threaded_pipeline.py +198 -0
- docling-2.39.0/docling/backend/html_backend.py +0 -577
- docling-2.39.0/docling/datamodel/asr_model_specs.py +0 -92
- docling-2.39.0/docling/datamodel/vlm_model_specs.py +0 -144
- docling-2.39.0/docling/document_converter.py +0 -384
- docling-2.39.0/docling/models/api_vlm_model.py +0 -73
- docling-2.39.0/docling/models/base_model.py +0 -93
- docling-2.39.0/docling/models/layout_model.py +0 -210
- docling-2.39.0/docling/models/plugins/defaults.py +0 -28
- docling-2.39.0/docling/models/rapid_ocr_model.py +0 -147
- docling-2.39.0/docling/models/table_structure_model.py +0 -302
- docling-2.39.0/docling/models/vlm_models_inline/hf_transformers_model.py +0 -197
- docling-2.39.0/docling/models/vlm_models_inline/mlx_model.py +0 -149
- docling-2.39.0/docling/utils/__init__.py +0 -0
- docling-2.39.0/docling/utils/api_image_request.py +0 -61
- docling-2.39.0/docling.egg-info/requires.txt +0 -49
- docling-2.39.0/tests/test_asr_pipeline.py +0 -59
- docling-2.39.0/tests/test_backend_html.py +0 -149
- docling-2.39.0/tests/test_backend_markdown.py +0 -52
- docling-2.39.0/tests/test_backend_msexcel.py +0 -99
- docling-2.39.0/tests/test_backend_msword.py +0 -173
- docling-2.39.0/tests/test_cli.py +0 -27
- docling-2.39.0/tests/test_interfaces.py +0 -67
- {docling-2.39.0 → docling-2.66.0}/LICENSE +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/__init__.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/backend/__init__.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/backend/noop_backend.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/chunking/__init__.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/cli/__init__.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/cli/tools.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/exceptions.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/models/__init__.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/models/utils/__init__.py +0 -0
- /docling-2.39.0/docling/py.typed → /docling-2.66.0/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.39.0/docling/models/vlm_models_inline → docling-2.66.0/docling/pipeline}/__init__.py +0 -0
- {docling-2.39.0/docling/pipeline → docling-2.66.0/docling/utils}/__init__.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/utils/export.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/utils/locks.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/utils/profiling.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/utils/utils.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling/utils/visualization.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.39.0 → docling-2.66.0}/docling.egg-info/top_level.txt +0 -0
- {docling-2.39.0 → docling-2.66.0}/setup.cfg +0 -0
- {docling-2.39.0 → docling-2.66.0}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/tests/test_backend_csv.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/tests/test_backend_docling_json.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/tests/test_backend_pptx.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/tests/test_code_formula.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/tests/test_data_gen_flag.py +0 -0
- {docling-2.39.0 → docling-2.66.0}/tests/test_invalid_input.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.66.0
|
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -22,34 +22,40 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
22
22
|
Classifier: Programming Language :: Python :: 3.11
|
|
23
23
|
Classifier: Programming Language :: Python :: 3.12
|
|
24
24
|
Classifier: Programming Language :: Python :: 3.13
|
|
25
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
25
26
|
Requires-Python: <4.0,>=3.9
|
|
26
27
|
Description-Content-Type: text/markdown
|
|
27
28
|
License-File: LICENSE
|
|
28
29
|
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
|
29
|
-
Requires-Dist: docling-core[chunking]<3.0.0,>=2.
|
|
30
|
-
Requires-Dist: docling-
|
|
31
|
-
Requires-Dist: docling-
|
|
30
|
+
Requires-Dist: docling-core[chunking]<3.0.0,>=2.50.1
|
|
31
|
+
Requires-Dist: docling-parse<5.0.0,>=4.7.0
|
|
32
|
+
Requires-Dist: docling-ibm-models<4,>=3.9.1
|
|
32
33
|
Requires-Dist: filetype<2.0.0,>=1.2.0
|
|
33
|
-
Requires-Dist: pypdfium2
|
|
34
|
+
Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
|
|
34
35
|
Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
|
|
35
36
|
Requires-Dist: huggingface_hub<1,>=0.23
|
|
36
37
|
Requires-Dist: requests<3.0.0,>=2.32.2
|
|
37
|
-
Requires-Dist:
|
|
38
|
+
Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin"
|
|
39
|
+
Requires-Dist: rapidocr<4.0.0,>=3.3
|
|
38
40
|
Requires-Dist: certifi>=2024.7.4
|
|
39
41
|
Requires-Dist: rtree<2.0.0,>=1.3.0
|
|
40
|
-
Requires-Dist: typer<0.
|
|
42
|
+
Requires-Dist: typer<0.20.0,>=0.12.5
|
|
41
43
|
Requires-Dist: python-docx<2.0.0,>=1.1.2
|
|
42
44
|
Requires-Dist: python-pptx<2.0.0,>=1.0.2
|
|
43
45
|
Requires-Dist: beautifulsoup4<5.0.0,>=4.12.3
|
|
44
46
|
Requires-Dist: pandas<3.0.0,>=2.1.4
|
|
45
47
|
Requires-Dist: marko<3.0.0,>=2.1.2
|
|
46
48
|
Requires-Dist: openpyxl<4.0.0,>=3.1.5
|
|
47
|
-
Requires-Dist: lxml<
|
|
49
|
+
Requires-Dist: lxml<7.0.0,>=4.0.0
|
|
48
50
|
Requires-Dist: pillow<12.0.0,>=10.0.0
|
|
49
51
|
Requires-Dist: tqdm<5.0.0,>=4.65.0
|
|
50
52
|
Requires-Dist: pluggy<2.0.0,>=1.0.0
|
|
51
53
|
Requires-Dist: pylatexenc<3.0,>=2.10
|
|
52
54
|
Requires-Dist: scipy<2.0.0,>=1.6.0
|
|
55
|
+
Requires-Dist: accelerate<2,>=1.0.0
|
|
56
|
+
Requires-Dist: polyfactory>=2.22.2
|
|
57
|
+
Provides-Extra: easyocr
|
|
58
|
+
Requires-Dist: easyocr<2.0,>=1.7; extra == "easyocr"
|
|
53
59
|
Provides-Extra: tesserocr
|
|
54
60
|
Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
|
|
55
61
|
Provides-Extra: ocrmac
|
|
@@ -57,12 +63,15 @@ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin" and extra == "ocrm
|
|
|
57
63
|
Provides-Extra: vlm
|
|
58
64
|
Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
|
|
59
65
|
Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
|
|
60
|
-
Requires-Dist: mlx-vlm
|
|
66
|
+
Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and python_version < "3.14" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
|
|
67
|
+
Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and python_version < "3.14" and sys_platform == "linux" and platform_machine == "x86_64") and extra == "vlm"
|
|
68
|
+
Requires-Dist: qwen-vl-utils>=0.0.11; extra == "vlm"
|
|
61
69
|
Provides-Extra: rapidocr
|
|
62
|
-
Requires-Dist: rapidocr
|
|
63
|
-
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
|
|
70
|
+
Requires-Dist: rapidocr<4.0.0,>=3.3; extra == "rapidocr"
|
|
71
|
+
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; python_version < "3.14" and extra == "rapidocr"
|
|
64
72
|
Provides-Extra: asr
|
|
65
|
-
Requires-Dist:
|
|
73
|
+
Requires-Dist: mlx-whisper>=0.4.3; (python_version >= "3.10" and python_version < "3.14" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "asr"
|
|
74
|
+
Requires-Dist: openai-whisper>=20250625; python_version < "3.14" and extra == "asr"
|
|
66
75
|
Dynamic: license-file
|
|
67
76
|
|
|
68
77
|
<p align="center">
|
|
@@ -88,6 +97,8 @@ Dynamic: license-file
|
|
|
88
97
|
[](https://opensource.org/licenses/MIT)
|
|
89
98
|
[](https://pepy.tech/projects/docling)
|
|
90
99
|
[](https://apify.com/vancura/docling)
|
|
100
|
+
[](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
|
|
101
|
+
[](https://docling.ai/discord)
|
|
91
102
|
[](https://www.bestpractices.dev/projects/10101)
|
|
92
103
|
[](https://lfaidata.foundation/projects/)
|
|
93
104
|
|
|
@@ -95,17 +106,24 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
|
95
106
|
|
|
96
107
|
## Features
|
|
97
108
|
|
|
98
|
-
* 🗂️
|
|
109
|
+
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
|
|
99
110
|
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
|
|
100
111
|
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
|
|
101
|
-
* ↪️
|
|
112
|
+
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
|
|
102
113
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
|
103
114
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
|
104
115
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
|
105
|
-
* 👓 Support of several Visual Language Models ([
|
|
106
|
-
* 🎙️
|
|
116
|
+
* 👓 Support of several Visual Language Models ([GraniteDocling](https://huggingface.co/ibm-granite/granite-docling-258M))
|
|
117
|
+
* 🎙️ Audio support with Automatic Speech Recognition (ASR) models
|
|
118
|
+
* 🔌 Connect to any agent using the [MCP server](https://docling-project.github.io/docling/usage/mcp/)
|
|
107
119
|
* 💻 Simple and convenient CLI
|
|
108
120
|
|
|
121
|
+
### What's new
|
|
122
|
+
* 📤 Structured [information extraction][extraction] \[🧪 beta\]
|
|
123
|
+
* 📑 New layout model (**Heron**) by default, for faster PDF parsing
|
|
124
|
+
* 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
|
|
125
|
+
* 💬 Parsing of Web Video Text Tracks (WebVTT) files
|
|
126
|
+
|
|
109
127
|
### Coming soon
|
|
110
128
|
|
|
111
129
|
* 📝 Metadata extraction, including title, authors, references & language
|
|
@@ -136,7 +154,7 @@ result = converter.convert(source)
|
|
|
136
154
|
print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
|
|
137
155
|
```
|
|
138
156
|
|
|
139
|
-
More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
|
|
157
|
+
More [advanced usage options](https://docling-project.github.io/docling/usage/advanced_options/) are available in
|
|
140
158
|
the docs.
|
|
141
159
|
|
|
142
160
|
## CLI
|
|
@@ -147,9 +165,9 @@ Docling has a built-in CLI to run conversions.
|
|
|
147
165
|
docling https://arxiv.org/pdf/2206.01062
|
|
148
166
|
```
|
|
149
167
|
|
|
150
|
-
You can also use 🥚[
|
|
168
|
+
You can also use 🥚[GraniteDocling](https://huggingface.co/ibm-granite/granite-docling-258M) and other VLMs via Docling CLI:
|
|
151
169
|
```bash
|
|
152
|
-
docling --pipeline vlm --vlm-model
|
|
170
|
+
docling --pipeline vlm --vlm-model granite_docling https://arxiv.org/pdf/2206.01062
|
|
153
171
|
```
|
|
154
172
|
This will use MLX acceleration on supported Apple Silicon hardware.
|
|
155
173
|
|
|
@@ -216,3 +234,4 @@ The project was started by the AI for knowledge team at IBM Research Zurich.
|
|
|
216
234
|
[supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
|
|
217
235
|
[docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
|
|
218
236
|
[integrations]: https://docling-project.github.io/docling/integrations/
|
|
237
|
+
[extraction]: https://docling-project.github.io/docling/examples/extraction/
|
|
@@ -21,6 +21,8 @@
|
|
|
21
21
|
[](https://opensource.org/licenses/MIT)
|
|
22
22
|
[](https://pepy.tech/projects/docling)
|
|
23
23
|
[](https://apify.com/vancura/docling)
|
|
24
|
+
[](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
|
|
25
|
+
[](https://docling.ai/discord)
|
|
24
26
|
[](https://www.bestpractices.dev/projects/10101)
|
|
25
27
|
[](https://lfaidata.foundation/projects/)
|
|
26
28
|
|
|
@@ -28,17 +30,24 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
|
28
30
|
|
|
29
31
|
## Features
|
|
30
32
|
|
|
31
|
-
* 🗂️
|
|
33
|
+
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
|
|
32
34
|
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
|
|
33
35
|
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
|
|
34
|
-
* ↪️
|
|
36
|
+
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
|
|
35
37
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
|
36
38
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
|
37
39
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
|
38
|
-
* 👓 Support of several Visual Language Models ([
|
|
39
|
-
* 🎙️
|
|
40
|
+
* 👓 Support of several Visual Language Models ([GraniteDocling](https://huggingface.co/ibm-granite/granite-docling-258M))
|
|
41
|
+
* 🎙️ Audio support with Automatic Speech Recognition (ASR) models
|
|
42
|
+
* 🔌 Connect to any agent using the [MCP server](https://docling-project.github.io/docling/usage/mcp/)
|
|
40
43
|
* 💻 Simple and convenient CLI
|
|
41
44
|
|
|
45
|
+
### What's new
|
|
46
|
+
* 📤 Structured [information extraction][extraction] \[🧪 beta\]
|
|
47
|
+
* 📑 New layout model (**Heron**) by default, for faster PDF parsing
|
|
48
|
+
* 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
|
|
49
|
+
* 💬 Parsing of Web Video Text Tracks (WebVTT) files
|
|
50
|
+
|
|
42
51
|
### Coming soon
|
|
43
52
|
|
|
44
53
|
* 📝 Metadata extraction, including title, authors, references & language
|
|
@@ -69,7 +78,7 @@ result = converter.convert(source)
|
|
|
69
78
|
print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
|
|
70
79
|
```
|
|
71
80
|
|
|
72
|
-
More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
|
|
81
|
+
More [advanced usage options](https://docling-project.github.io/docling/usage/advanced_options/) are available in
|
|
73
82
|
the docs.
|
|
74
83
|
|
|
75
84
|
## CLI
|
|
@@ -80,9 +89,9 @@ Docling has a built-in CLI to run conversions.
|
|
|
80
89
|
docling https://arxiv.org/pdf/2206.01062
|
|
81
90
|
```
|
|
82
91
|
|
|
83
|
-
You can also use 🥚[
|
|
92
|
+
You can also use 🥚[GraniteDocling](https://huggingface.co/ibm-granite/granite-docling-258M) and other VLMs via Docling CLI:
|
|
84
93
|
```bash
|
|
85
|
-
docling --pipeline vlm --vlm-model
|
|
94
|
+
docling --pipeline vlm --vlm-model granite_docling https://arxiv.org/pdf/2206.01062
|
|
86
95
|
```
|
|
87
96
|
This will use MLX acceleration on supported Apple Silicon hardware.
|
|
88
97
|
|
|
@@ -149,3 +158,4 @@ The project was started by the AI for knowledge team at IBM Research Zurich.
|
|
|
149
158
|
[supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
|
|
150
159
|
[docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
|
|
151
160
|
[integrations]: https://docling-project.github.io/docling/integrations/
|
|
161
|
+
[extraction]: https://docling-project.github.io/docling/examples/extraction/
|
|
@@ -1,10 +1,16 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
2
|
from io import BytesIO
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import TYPE_CHECKING,
|
|
4
|
+
from typing import TYPE_CHECKING, Union
|
|
5
5
|
|
|
6
6
|
from docling_core.types.doc import DoclingDocument
|
|
7
7
|
|
|
8
|
+
from docling.datamodel.backend_options import (
|
|
9
|
+
BackendOptions,
|
|
10
|
+
BaseBackendOptions,
|
|
11
|
+
DeclarativeBackendOptions,
|
|
12
|
+
)
|
|
13
|
+
|
|
8
14
|
if TYPE_CHECKING:
|
|
9
15
|
from docling.datamodel.base_models import InputFormat
|
|
10
16
|
from docling.datamodel.document import InputDocument
|
|
@@ -12,11 +18,17 @@ if TYPE_CHECKING:
|
|
|
12
18
|
|
|
13
19
|
class AbstractDocumentBackend(ABC):
|
|
14
20
|
@abstractmethod
|
|
15
|
-
def __init__(
|
|
21
|
+
def __init__(
|
|
22
|
+
self,
|
|
23
|
+
in_doc: "InputDocument",
|
|
24
|
+
path_or_stream: Union[BytesIO, Path],
|
|
25
|
+
options: BaseBackendOptions = BaseBackendOptions(),
|
|
26
|
+
):
|
|
16
27
|
self.file = in_doc.file
|
|
17
28
|
self.path_or_stream = path_or_stream
|
|
18
29
|
self.document_hash = in_doc.document_hash
|
|
19
30
|
self.input_format = in_doc.format
|
|
31
|
+
self.options = options
|
|
20
32
|
|
|
21
33
|
@abstractmethod
|
|
22
34
|
def is_valid(self) -> bool:
|
|
@@ -35,7 +47,7 @@ class AbstractDocumentBackend(ABC):
|
|
|
35
47
|
|
|
36
48
|
@classmethod
|
|
37
49
|
@abstractmethod
|
|
38
|
-
def supported_formats(cls) ->
|
|
50
|
+
def supported_formats(cls) -> set["InputFormat"]:
|
|
39
51
|
pass
|
|
40
52
|
|
|
41
53
|
|
|
@@ -58,6 +70,15 @@ class DeclarativeDocumentBackend(AbstractDocumentBackend):
|
|
|
58
70
|
straight without a recognition pipeline.
|
|
59
71
|
"""
|
|
60
72
|
|
|
73
|
+
@abstractmethod
|
|
74
|
+
def __init__(
|
|
75
|
+
self,
|
|
76
|
+
in_doc: "InputDocument",
|
|
77
|
+
path_or_stream: Union[BytesIO, Path],
|
|
78
|
+
options: BackendOptions = DeclarativeBackendOptions(),
|
|
79
|
+
) -> None:
|
|
80
|
+
super().__init__(in_doc, path_or_stream, options)
|
|
81
|
+
|
|
61
82
|
@abstractmethod
|
|
62
83
|
def convert(self) -> DoclingDocument:
|
|
63
84
|
pass
|
|
@@ -2,7 +2,7 @@ import logging
|
|
|
2
2
|
import re
|
|
3
3
|
from io import BytesIO
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import Final,
|
|
5
|
+
from typing import Final, Union
|
|
6
6
|
|
|
7
7
|
from docling_core.types.doc import (
|
|
8
8
|
DocItemLabel,
|
|
@@ -27,7 +27,7 @@ DEFAULT_IMAGE_HEIGHT: Final = 128
|
|
|
27
27
|
|
|
28
28
|
|
|
29
29
|
class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
30
|
-
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
|
|
30
|
+
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
|
31
31
|
super().__init__(in_doc, path_or_stream)
|
|
32
32
|
|
|
33
33
|
self.path_or_stream = path_or_stream
|
|
@@ -58,7 +58,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
|
58
58
|
return
|
|
59
59
|
|
|
60
60
|
@classmethod
|
|
61
|
-
def supported_formats(cls) ->
|
|
61
|
+
def supported_formats(cls) -> set[InputFormat]:
|
|
62
62
|
return {InputFormat.ASCIIDOC}
|
|
63
63
|
|
|
64
64
|
def convert(self) -> DoclingDocument:
|
|
@@ -78,7 +78,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
|
78
78
|
|
|
79
79
|
return doc
|
|
80
80
|
|
|
81
|
-
def _parse(self, doc: DoclingDocument):
|
|
81
|
+
def _parse(self, doc: DoclingDocument):
|
|
82
82
|
"""
|
|
83
83
|
Main function that orchestrates the parsing by yielding components:
|
|
84
84
|
title, section headers, text, lists, and tables.
|
|
@@ -12,6 +12,7 @@ from PIL import Image
|
|
|
12
12
|
from pypdfium2 import PdfPage
|
|
13
13
|
|
|
14
14
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
|
15
|
+
from docling.datamodel.backend_options import PdfBackendOptions
|
|
15
16
|
from docling.datamodel.base_models import Size
|
|
16
17
|
from docling.utils.locks import pypdfium2_lock
|
|
17
18
|
|
|
@@ -22,15 +23,64 @@ _log = logging.getLogger(__name__)
|
|
|
22
23
|
|
|
23
24
|
|
|
24
25
|
class DoclingParseV4PageBackend(PdfPageBackend):
|
|
25
|
-
def __init__(
|
|
26
|
+
def __init__(
|
|
27
|
+
self,
|
|
28
|
+
*,
|
|
29
|
+
dp_doc: PdfDocument,
|
|
30
|
+
page_obj: PdfPage,
|
|
31
|
+
page_no: int,
|
|
32
|
+
create_words: bool = True,
|
|
33
|
+
create_textlines: bool = True,
|
|
34
|
+
keep_chars: bool = False,
|
|
35
|
+
keep_lines: bool = False,
|
|
36
|
+
keep_images: bool = True,
|
|
37
|
+
):
|
|
26
38
|
self._ppage = page_obj
|
|
27
|
-
self.
|
|
28
|
-
self.
|
|
39
|
+
self._dp_doc = dp_doc
|
|
40
|
+
self._page_no = page_no
|
|
41
|
+
|
|
42
|
+
self._create_words = create_words
|
|
43
|
+
self._create_textlines = create_textlines
|
|
44
|
+
|
|
45
|
+
self._keep_chars = keep_chars
|
|
46
|
+
self._keep_lines = keep_lines
|
|
47
|
+
self._keep_images = keep_images
|
|
48
|
+
|
|
49
|
+
self._dpage: Optional[SegmentedPdfPage] = None
|
|
50
|
+
self._unloaded = False
|
|
51
|
+
self.valid = (self._ppage is not None) and (self._dp_doc is not None)
|
|
52
|
+
|
|
53
|
+
def _ensure_parsed(self) -> None:
|
|
54
|
+
if self._dpage is not None:
|
|
55
|
+
return
|
|
56
|
+
|
|
57
|
+
seg_page = self._dp_doc.get_page(
|
|
58
|
+
self._page_no + 1,
|
|
59
|
+
keep_chars=self._keep_chars,
|
|
60
|
+
keep_lines=self._keep_lines,
|
|
61
|
+
keep_bitmaps=self._keep_images,
|
|
62
|
+
create_words=self._create_words,
|
|
63
|
+
create_textlines=self._create_textlines,
|
|
64
|
+
enforce_same_font=True,
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
# In Docling, all TextCell instances are expected with top-left origin.
|
|
68
|
+
[
|
|
69
|
+
tc.to_top_left_origin(seg_page.dimension.height)
|
|
70
|
+
for tc in seg_page.textline_cells
|
|
71
|
+
]
|
|
72
|
+
[tc.to_top_left_origin(seg_page.dimension.height) for tc in seg_page.char_cells]
|
|
73
|
+
[tc.to_top_left_origin(seg_page.dimension.height) for tc in seg_page.word_cells]
|
|
74
|
+
|
|
75
|
+
self._dpage = seg_page
|
|
29
76
|
|
|
30
77
|
def is_valid(self) -> bool:
    """Return the ``valid`` flag stored on this page backend."""
    return self.valid
|
|
32
79
|
|
|
33
80
|
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
|
81
|
+
self._ensure_parsed()
|
|
82
|
+
assert self._dpage is not None
|
|
83
|
+
|
|
34
84
|
# Find intersecting cells on the page
|
|
35
85
|
text_piece = ""
|
|
36
86
|
page_size = self.get_size()
|
|
@@ -56,12 +106,19 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|
|
56
106
|
return text_piece
|
|
57
107
|
|
|
58
108
|
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
    """Return the cached segmented page, parsing the page first if needed."""
    self._ensure_parsed()
    segmented = self._dpage
    return segmented
|
|
60
111
|
|
|
61
112
|
def get_text_cells(self) -> Iterable[TextCell]:
    """Return the text-line cells of the page, parsing it first if needed."""
    self._ensure_parsed()
    segmented = self._dpage
    assert segmented is not None

    return segmented.textline_cells
|
|
63
117
|
|
|
64
118
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
|
119
|
+
self._ensure_parsed()
|
|
120
|
+
assert self._dpage is not None
|
|
121
|
+
|
|
65
122
|
AREA_THRESHOLD = 0 # 32 * 32
|
|
66
123
|
|
|
67
124
|
images = self._dpage.bitmap_resources
|
|
@@ -123,18 +180,33 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|
|
123
180
|
# )
|
|
124
181
|
|
|
125
182
|
def unload(self):
    """Ask the docling-parse document to unload this page, then drop all handles.

    ``unload_pages`` is called at most once per instance (tracked through
    ``self._unloaded``) and only while a document handle is still held.
    """
    needs_release = not self._unloaded and self._dp_doc is not None
    if needs_release:
        first = self._page_no + 1
        self._dp_doc.unload_pages((first, first + 1))
        self._unloaded = True

    # Drop every reference held by this backend.
    self._ppage = self._dpage = self._dp_doc = None
|
|
128
190
|
|
|
129
191
|
|
|
130
192
|
class DoclingParseV4DocumentBackend(PdfDocumentBackend):
|
|
131
|
-
def __init__(
|
|
132
|
-
|
|
133
|
-
|
|
193
|
+
def __init__(
|
|
194
|
+
self,
|
|
195
|
+
in_doc: "InputDocument",
|
|
196
|
+
path_or_stream: Union[BytesIO, Path],
|
|
197
|
+
options: PdfBackendOptions = PdfBackendOptions(),
|
|
198
|
+
):
|
|
199
|
+
super().__init__(in_doc, path_or_stream, options)
|
|
200
|
+
|
|
201
|
+
password = (
|
|
202
|
+
self.options.password.get_secret_value() if self.options.password else None
|
|
203
|
+
)
|
|
134
204
|
with pypdfium2_lock:
|
|
135
|
-
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
|
205
|
+
self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
|
|
136
206
|
self.parser = DoclingPdfParser(loglevel="fatal")
|
|
137
|
-
self.dp_doc: PdfDocument = self.parser.load(
|
|
207
|
+
self.dp_doc: PdfDocument = self.parser.load(
|
|
208
|
+
path_or_stream=self.path_or_stream, password=password
|
|
209
|
+
)
|
|
138
210
|
success = self.dp_doc is not None
|
|
139
211
|
|
|
140
212
|
if not success:
|
|
@@ -157,37 +229,32 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
|
|
|
157
229
|
self, page_no: int, create_words: bool = True, create_textlines: bool = True
|
|
158
230
|
) -> DoclingParseV4PageBackend:
|
|
159
231
|
with pypdfium2_lock:
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
for tc in seg_page.textline_cells
|
|
170
|
-
]
|
|
171
|
-
[
|
|
172
|
-
tc.to_top_left_origin(seg_page.dimension.height)
|
|
173
|
-
for tc in seg_page.char_cells
|
|
174
|
-
]
|
|
175
|
-
[
|
|
176
|
-
tc.to_top_left_origin(seg_page.dimension.height)
|
|
177
|
-
for tc in seg_page.word_cells
|
|
178
|
-
]
|
|
179
|
-
|
|
180
|
-
return DoclingParseV4PageBackend(
|
|
181
|
-
seg_page,
|
|
182
|
-
self._pdoc[page_no],
|
|
183
|
-
)
|
|
232
|
+
ppage = self._pdoc[page_no]
|
|
233
|
+
|
|
234
|
+
return DoclingParseV4PageBackend(
|
|
235
|
+
dp_doc=self.dp_doc,
|
|
236
|
+
page_obj=ppage,
|
|
237
|
+
page_no=page_no,
|
|
238
|
+
create_words=create_words,
|
|
239
|
+
create_textlines=create_textlines,
|
|
240
|
+
)
|
|
184
241
|
|
|
185
242
|
def is_valid(self) -> bool:
    """Return True when the document reports at least one page."""
    n_pages = self.page_count()
    return n_pages > 0
|
|
187
244
|
|
|
188
245
|
def unload(self):
    """Release the docling-parse document, then close the pypdfium2 document.

    Closing the pypdfium2 document is best-effort: an error raised by
    ``close()`` is logged at debug level and does not propagate.
    """
    super().unload()

    # Unload docling-parse document first
    if self.dp_doc is not None:
        self.dp_doc.unload()
        self.dp_doc = None

    # Then close pypdfium2 document with proper locking
    if self._pdoc is not None:
        with pypdfium2_lock:
            try:
                self._pdoc.close()
            except Exception:
                # Cleanup must not fail the caller, but leave a trace instead
                # of swallowing the error silently.
                _log.debug(
                    "Ignoring error while closing pypdfium2 document",
                    exc_info=True,
                )
        self._pdoc = None
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import shutil
|
|
3
|
+
import subprocess
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from tempfile import mkdtemp
|
|
6
|
+
from typing import Callable, Optional
|
|
7
|
+
|
|
8
|
+
import pypdfium2
|
|
9
|
+
from docx.document import Document
|
|
10
|
+
from PIL import Image, ImageChops
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def get_libreoffice_cmd(raise_if_unavailable: bool = False) -> Optional[str]:
    """Locate the LibreOffice executable and optionally verify that it runs.

    The lookup tries ``libreoffice`` and ``soffice`` on the PATH, then the
    fixed macOS application-bundle location.

    Args:
        raise_if_unavailable: When True, raise if no executable is found and
            run ``<cmd> -h`` as a smoke test.

    Returns:
        The command path, or None when nothing was found and
        ``raise_if_unavailable`` is False.

    Raises:
        RuntimeError: No executable found and ``raise_if_unavailable`` is True.
        subprocess.CalledProcessError: The smoke test exited with a non-zero
            status.
    """
    mac_bundle_cmd = "/Applications/LibreOffice.app/Contents/MacOS/soffice"

    cmd = shutil.which("libreoffice") or shutil.which("soffice")
    if not cmd:
        cmd = mac_bundle_cmd if os.path.isfile(mac_bundle_cmd) else None

    if not raise_if_unavailable:
        return cmd

    if cmd is None:
        raise RuntimeError("Libreoffice not found")

    # The following test will raise if the command cannot be used
    subprocess.run(
        [cmd, "-h"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        check=True,
    )
    return cmd
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def get_docx_to_pdf_converter() -> Optional[Callable]:
    """
    Detects the best available DOCX to PDF tool and returns a conversion function.
    The returned function accepts (input_path, output_path), each either a
    string or an ``os.PathLike`` object.
    Returns None if no tool is available.
    """

    # Try LibreOffice
    libreoffice_cmd = get_libreoffice_cmd()

    if libreoffice_cmd:

        def convert_with_libreoffice(input_path, output_path):
            # Normalise to plain strings so the path comparison below is
            # meaningful when callers pass pathlib.Path objects.
            in_path = os.fspath(input_path)
            out_path = os.fspath(output_path)
            # An empty dirname means "current directory"; pass it explicitly.
            out_dir = os.path.dirname(out_path) or os.curdir

            subprocess.run(
                [
                    libreoffice_cmd,
                    "--headless",
                    "--convert-to",
                    "pdf",
                    "--outdir",
                    out_dir,
                    in_path,
                ],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=True,
            )

            # The result is looked for under the input's base name with a
            # ".pdf" suffix inside out_dir, and moved if another output name
            # was requested.
            produced = os.path.join(
                out_dir,
                os.path.splitext(os.path.basename(in_path))[0] + ".pdf",
            )
            if os.path.normpath(produced) != os.path.normpath(out_path):
                # os.replace overwrites an existing target on every platform.
                os.replace(produced, out_path)

        return convert_with_libreoffice

    ## Space for other DOCX to PDF converters if available

    # No tools found
    return None
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def crop_whitespace(image: Image.Image, bg_color=None, padding=0) -> Image.Image:
    """Crop an image to the region that differs from a uniform background.

    Args:
        image: Image to crop.
        bg_color: Background colour; defaults to the pixel at (0, 0).
        padding: Number of pixels kept around the detected region, clamped
            to the image bounds.

    Returns:
        The cropped image, or ``image`` itself when no differing region is
        found.
    """
    if bg_color is None:
        bg_color = image.getpixel((0, 0))

    background = Image.new(image.mode, image.size, bg_color)
    content_box = ImageChops.difference(image, background).getbbox()

    if not content_box:
        return image

    x0, y0, x1, y1 = content_box
    padded_box = (
        max(0, x0 - padding),
        max(0, y0 - padding),
        min(image.width, x1 + padding),
        min(image.height, y1 + padding),
    )
    return image.crop(padded_box)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def get_pil_from_dml_docx(
    docx: Document, converter: Optional[Callable]
) -> Optional[Image.Image]:
    """Render the first page of a DOCX document to a whitespace-cropped image.

    The document is saved to a temporary directory, converted to PDF with
    ``converter``, and the first PDF page is rendered at scale 2. The
    temporary directory and the pypdfium2 handles are released even when one
    of the steps raises.

    Args:
        docx: Document to render.
        converter: Callable invoked as ``converter(input_path, output_path)``,
            or None when no conversion tool is available.

    Returns:
        The cropped image, or None when ``converter`` is None.
    """
    if converter is None:
        return None

    temp_dir = Path(mkdtemp())
    try:
        temp_docx = temp_dir / "drawing_only.docx"
        temp_pdf = temp_dir / "drawing_only.pdf"

        # 1) Save docx temporarily
        docx.save(str(temp_docx))

        # 2) Export to PDF
        converter(temp_docx, temp_pdf)

        # 3) Load PDF as PNG
        pdf = pypdfium2.PdfDocument(temp_pdf)
        try:
            page = pdf[0]
            try:
                return crop_whitespace(page.render(scale=2).to_pil())
            finally:
                page.close()
        finally:
            pdf.close()
    finally:
        # Always remove the scratch directory, including on failure.
        shutil.rmtree(temp_dir, ignore_errors=True)
|
|
@@ -65,6 +65,11 @@ CHR_BO = {
|
|
|
65
65
|
"\u2210": "\\coprod",
|
|
66
66
|
"\u2211": "\\sum",
|
|
67
67
|
"\u222b": "\\int",
|
|
68
|
+
"\u222c": "\\iint",
|
|
69
|
+
"\u222d": "\\iiint",
|
|
70
|
+
"\u222e": "\\oint",
|
|
71
|
+
"\u222f": "\\oiint",
|
|
72
|
+
"\u2230": "\\oiiint",
|
|
68
73
|
"\u22c0": "\\bigwedge",
|
|
69
74
|
"\u22c1": "\\bigvee",
|
|
70
75
|
"\u22c2": "\\bigcap",
|
|
@@ -260,7 +260,15 @@ class oMath2Latex(Tag2Method):
|
|
|
260
260
|
the fraction object
|
|
261
261
|
"""
|
|
262
262
|
c_dict = self.process_children_dict(elm)
|
|
263
|
-
pr = c_dict
|
|
263
|
+
pr = c_dict.get("fPr")
|
|
264
|
+
if pr is None:
|
|
265
|
+
# Handle missing fPr element gracefully
|
|
266
|
+
_log.debug("Missing fPr element in fraction, using default formatting")
|
|
267
|
+
latex_s = F_DEFAULT
|
|
268
|
+
return latex_s.format(
|
|
269
|
+
num=c_dict.get("num"),
|
|
270
|
+
den=c_dict.get("den"),
|
|
271
|
+
)
|
|
264
272
|
latex_s = get_val(pr.type, default=F_DEFAULT, store=F)
|
|
265
273
|
return pr.text + latex_s.format(num=c_dict.get("num"), den=c_dict.get("den"))
|
|
266
274
|
|
|
@@ -373,7 +381,8 @@ class oMath2Latex(Tag2Method):
|
|
|
373
381
|
bo = ""
|
|
374
382
|
for stag, t, e in self.process_children_list(elm):
|
|
375
383
|
if stag == "naryPr":
|
|
376
|
-
|
|
384
|
+
# if <m:naryPr> contains no <m:chr>, the n-ary represents an integral
|
|
385
|
+
bo = get_val(t.chr, default="\\int", store=CHR_BO)
|
|
377
386
|
else:
|
|
378
387
|
res.append(t)
|
|
379
388
|
return bo + BLANK.join(res)
|