docling 2.57.0__tar.gz → 2.58.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- {docling-2.57.0 → docling-2.58.0}/PKG-INFO +4 -2
- {docling-2.57.0 → docling-2.58.0}/README.md +1 -0
- {docling-2.57.0 → docling-2.58.0}/docling/backend/abstract_backend.py +24 -3
- {docling-2.57.0 → docling-2.58.0}/docling/backend/asciidoc_backend.py +3 -3
- {docling-2.57.0 → docling-2.58.0}/docling/backend/docling_parse_v4_backend.py +15 -4
- {docling-2.57.0 → docling-2.58.0}/docling/backend/html_backend.py +130 -20
- {docling-2.57.0 → docling-2.58.0}/docling/backend/md_backend.py +27 -5
- {docling-2.57.0 → docling-2.58.0}/docling/backend/msexcel_backend.py +115 -27
- {docling-2.57.0 → docling-2.58.0}/docling/backend/mspowerpoint_backend.py +2 -2
- {docling-2.57.0 → docling-2.58.0}/docling/backend/msword_backend.py +18 -18
- {docling-2.57.0 → docling-2.58.0}/docling/backend/pdf_backend.py +9 -2
- {docling-2.57.0 → docling-2.58.0}/docling/backend/pypdfium2_backend.py +12 -3
- {docling-2.57.0 → docling-2.58.0}/docling/cli/main.py +85 -30
- docling-2.58.0/docling/datamodel/asr_model_specs.py +494 -0
- docling-2.58.0/docling/datamodel/backend_options.py +82 -0
- {docling-2.57.0 → docling-2.58.0}/docling/datamodel/base_models.py +17 -2
- {docling-2.57.0 → docling-2.58.0}/docling/datamodel/document.py +81 -48
- {docling-2.57.0 → docling-2.58.0}/docling/datamodel/pipeline_options_asr_model.py +21 -1
- {docling-2.57.0 → docling-2.58.0}/docling/document_converter.py +37 -45
- {docling-2.57.0 → docling-2.58.0}/docling/document_extractor.py +12 -11
- {docling-2.57.0 → docling-2.58.0}/docling/models/readingorder_model.py +6 -7
- {docling-2.57.0 → docling-2.58.0}/docling/pipeline/asr_pipeline.py +139 -3
- {docling-2.57.0 → docling-2.58.0}/docling/utils/api_image_request.py +4 -4
- {docling-2.57.0 → docling-2.58.0}/docling/utils/layout_postprocessor.py +23 -24
- {docling-2.57.0 → docling-2.58.0}/docling.egg-info/PKG-INFO +4 -2
- {docling-2.57.0 → docling-2.58.0}/docling.egg-info/SOURCES.txt +3 -0
- {docling-2.57.0 → docling-2.58.0}/docling.egg-info/requires.txt +4 -1
- {docling-2.57.0 → docling-2.58.0}/pyproject.toml +3 -2
- docling-2.58.0/tests/test_asr_mlx_whisper.py +340 -0
- docling-2.58.0/tests/test_asr_pipeline.py +398 -0
- docling-2.58.0/tests/test_backend_html.py +443 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_backend_markdown.py +1 -2
- {docling-2.57.0 → docling-2.58.0}/tests/test_backend_msexcel.py +65 -0
- docling-2.58.0/tests/test_cli.py +92 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_input_doc.py +42 -0
- docling-2.58.0/tests/test_interfaces.py +138 -0
- docling-2.58.0/tests/test_pdf_password.py +63 -0
- docling-2.57.0/docling/datamodel/asr_model_specs.py +0 -92
- docling-2.57.0/tests/test_asr_pipeline.py +0 -85
- docling-2.57.0/tests/test_backend_html.py +0 -213
- docling-2.57.0/tests/test_cli.py +0 -27
- docling-2.57.0/tests/test_interfaces.py +0 -63
- {docling-2.57.0 → docling-2.58.0}/LICENSE +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/__init__.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/backend/__init__.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/backend/docx/drawingml/utils.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/backend/mets_gbs_backend.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/backend/noop_backend.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/backend/webvtt_backend.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/chunking/__init__.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/cli/__init__.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/cli/models.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/cli/tools.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/datamodel/extraction.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/datamodel/layout_model_specs.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/datamodel/pipeline_options.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/datamodel/settings.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/datamodel/vlm_model_specs.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/exceptions.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/__init__.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/api_vlm_model.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/auto_ocr_model.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/base_model.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/layout_model.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/utils/__init__.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/utils/generation_utils.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/utils/hf_model_download.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/vlm_models_inline/nuextract_transformers_model.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/models/vlm_models_inline/vllm_model.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/pipeline/base_extraction_pipeline.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/pipeline/extraction_vlm_pipeline.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/pipeline/vlm_pipeline.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/py.typed +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/utils/__init__.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/utils/export.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/utils/locks.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/utils/model_downloader.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/utils/orientation.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/utils/profiling.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/utils/utils.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling/utils/visualization.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.57.0 → docling-2.58.0}/docling.egg-info/top_level.txt +0 -0
- {docling-2.57.0 → docling-2.58.0}/setup.cfg +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_backend_csv.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_backend_docling_json.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_backend_docling_parse_v4.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_backend_jats.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_backend_mets_gbs.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_backend_msword.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_backend_patent_uspto.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_backend_pdfium.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_backend_pptx.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_backend_vtt.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_backend_webp.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_code_formula.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_data_gen_flag.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_document_picture_classifier.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_e2e_conversion.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_e2e_ocr_conversion.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_extraction.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_invalid_input.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_legacy_format_transform.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_ocr_utils.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_options.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_settings_load.py +0 -0
- {docling-2.57.0 → docling-2.58.0}/tests/test_threaded_pipeline.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling
|
|
3
|
-
Version: 2.57.0
|
|
3
|
+
Version: 2.58.0
|
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -27,7 +27,7 @@ Description-Content-Type: text/markdown
|
|
|
27
27
|
License-File: LICENSE
|
|
28
28
|
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
|
29
29
|
Requires-Dist: docling-core[chunking]<3.0.0,>=2.48.2
|
|
30
|
-
Requires-Dist: docling-parse<5.0.0,>=4.
|
|
30
|
+
Requires-Dist: docling-parse<5.0.0,>=4.7.0
|
|
31
31
|
Requires-Dist: docling-ibm-models<4,>=3.9.1
|
|
32
32
|
Requires-Dist: filetype<2.0.0,>=1.2.0
|
|
33
33
|
Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
|
|
@@ -69,6 +69,7 @@ Provides-Extra: rapidocr
|
|
|
69
69
|
Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14" and extra == "rapidocr"
|
|
70
70
|
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
|
|
71
71
|
Provides-Extra: asr
|
|
72
|
+
Requires-Dist: mlx-whisper>=0.4.3; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "asr"
|
|
72
73
|
Requires-Dist: openai-whisper>=20250625; extra == "asr"
|
|
73
74
|
Dynamic: license-file
|
|
74
75
|
|
|
@@ -96,6 +97,7 @@ Dynamic: license-file
|
|
|
96
97
|
[](https://pepy.tech/projects/docling)
|
|
97
98
|
[](https://apify.com/vancura/docling)
|
|
98
99
|
[](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
|
|
100
|
+
[](https://docling.ai/discord)
|
|
99
101
|
[](https://www.bestpractices.dev/projects/10101)
|
|
100
102
|
[](https://lfaidata.foundation/projects/)
|
|
101
103
|
|
|
@@ -22,6 +22,7 @@
|
|
|
22
22
|
[](https://pepy.tech/projects/docling)
|
|
23
23
|
[](https://apify.com/vancura/docling)
|
|
24
24
|
[](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
|
|
25
|
+
[](https://docling.ai/discord)
|
|
25
26
|
[](https://www.bestpractices.dev/projects/10101)
|
|
26
27
|
[](https://lfaidata.foundation/projects/)
|
|
27
28
|
|
|
@@ -1,10 +1,16 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
2
|
from io import BytesIO
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import TYPE_CHECKING,
|
|
4
|
+
from typing import TYPE_CHECKING, Union
|
|
5
5
|
|
|
6
6
|
from docling_core.types.doc import DoclingDocument
|
|
7
7
|
|
|
8
|
+
from docling.datamodel.backend_options import (
|
|
9
|
+
BackendOptions,
|
|
10
|
+
BaseBackendOptions,
|
|
11
|
+
DeclarativeBackendOptions,
|
|
12
|
+
)
|
|
13
|
+
|
|
8
14
|
if TYPE_CHECKING:
|
|
9
15
|
from docling.datamodel.base_models import InputFormat
|
|
10
16
|
from docling.datamodel.document import InputDocument
|
|
@@ -12,11 +18,17 @@ if TYPE_CHECKING:
|
|
|
12
18
|
|
|
13
19
|
class AbstractDocumentBackend(ABC):
|
|
14
20
|
@abstractmethod
|
|
15
|
-
def __init__(
|
|
21
|
+
def __init__(
|
|
22
|
+
self,
|
|
23
|
+
in_doc: "InputDocument",
|
|
24
|
+
path_or_stream: Union[BytesIO, Path],
|
|
25
|
+
options: BaseBackendOptions = BaseBackendOptions(),
|
|
26
|
+
):
|
|
16
27
|
self.file = in_doc.file
|
|
17
28
|
self.path_or_stream = path_or_stream
|
|
18
29
|
self.document_hash = in_doc.document_hash
|
|
19
30
|
self.input_format = in_doc.format
|
|
31
|
+
self.options = options
|
|
20
32
|
|
|
21
33
|
@abstractmethod
|
|
22
34
|
def is_valid(self) -> bool:
|
|
@@ -35,7 +47,7 @@ class AbstractDocumentBackend(ABC):
|
|
|
35
47
|
|
|
36
48
|
@classmethod
|
|
37
49
|
@abstractmethod
|
|
38
|
-
def supported_formats(cls) ->
|
|
50
|
+
def supported_formats(cls) -> set["InputFormat"]:
|
|
39
51
|
pass
|
|
40
52
|
|
|
41
53
|
|
|
@@ -58,6 +70,15 @@ class DeclarativeDocumentBackend(AbstractDocumentBackend):
|
|
|
58
70
|
straight without a recognition pipeline.
|
|
59
71
|
"""
|
|
60
72
|
|
|
73
|
+
@abstractmethod
|
|
74
|
+
def __init__(
|
|
75
|
+
self,
|
|
76
|
+
in_doc: "InputDocument",
|
|
77
|
+
path_or_stream: Union[BytesIO, Path],
|
|
78
|
+
options: BackendOptions = DeclarativeBackendOptions(),
|
|
79
|
+
) -> None:
|
|
80
|
+
super().__init__(in_doc, path_or_stream, options)
|
|
81
|
+
|
|
61
82
|
@abstractmethod
|
|
62
83
|
def convert(self) -> DoclingDocument:
|
|
63
84
|
pass
|
|
@@ -2,7 +2,7 @@ import logging
|
|
|
2
2
|
import re
|
|
3
3
|
from io import BytesIO
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import Final,
|
|
5
|
+
from typing import Final, Union
|
|
6
6
|
|
|
7
7
|
from docling_core.types.doc import (
|
|
8
8
|
DocItemLabel,
|
|
@@ -27,7 +27,7 @@ DEFAULT_IMAGE_HEIGHT: Final = 128
|
|
|
27
27
|
|
|
28
28
|
|
|
29
29
|
class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
30
|
-
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
|
|
30
|
+
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
|
31
31
|
super().__init__(in_doc, path_or_stream)
|
|
32
32
|
|
|
33
33
|
self.path_or_stream = path_or_stream
|
|
@@ -58,7 +58,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
|
58
58
|
return
|
|
59
59
|
|
|
60
60
|
@classmethod
|
|
61
|
-
def supported_formats(cls) ->
|
|
61
|
+
def supported_formats(cls) -> set[InputFormat]:
|
|
62
62
|
return {InputFormat.ASCIIDOC}
|
|
63
63
|
|
|
64
64
|
def convert(self) -> DoclingDocument:
|
|
@@ -12,6 +12,7 @@ from PIL import Image
|
|
|
12
12
|
from pypdfium2 import PdfPage
|
|
13
13
|
|
|
14
14
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
|
15
|
+
from docling.datamodel.backend_options import PdfBackendOptions
|
|
15
16
|
from docling.datamodel.base_models import Size
|
|
16
17
|
from docling.utils.locks import pypdfium2_lock
|
|
17
18
|
|
|
@@ -189,13 +190,23 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|
|
189
190
|
|
|
190
191
|
|
|
191
192
|
class DoclingParseV4DocumentBackend(PdfDocumentBackend):
|
|
192
|
-
def __init__(
|
|
193
|
-
|
|
193
|
+
def __init__(
|
|
194
|
+
self,
|
|
195
|
+
in_doc: "InputDocument",
|
|
196
|
+
path_or_stream: Union[BytesIO, Path],
|
|
197
|
+
options: PdfBackendOptions = PdfBackendOptions(),
|
|
198
|
+
):
|
|
199
|
+
super().__init__(in_doc, path_or_stream, options)
|
|
194
200
|
|
|
201
|
+
password = (
|
|
202
|
+
self.options.password.get_secret_value() if self.options.password else None
|
|
203
|
+
)
|
|
195
204
|
with pypdfium2_lock:
|
|
196
|
-
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
|
205
|
+
self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
|
|
197
206
|
self.parser = DoclingPdfParser(loglevel="fatal")
|
|
198
|
-
self.dp_doc: PdfDocument = self.parser.load(
|
|
207
|
+
self.dp_doc: PdfDocument = self.parser.load(
|
|
208
|
+
path_or_stream=self.path_or_stream, password=password
|
|
209
|
+
)
|
|
199
210
|
success = self.dp_doc is not None
|
|
200
211
|
|
|
201
212
|
if not success:
|
|
@@ -1,13 +1,16 @@
|
|
|
1
|
+
import base64
|
|
1
2
|
import logging
|
|
3
|
+
import os
|
|
2
4
|
import re
|
|
3
|
-
import
|
|
5
|
+
import warnings
|
|
4
6
|
from contextlib import contextmanager
|
|
5
7
|
from copy import deepcopy
|
|
6
8
|
from io import BytesIO
|
|
7
9
|
from pathlib import Path
|
|
8
10
|
from typing import Final, Optional, Union, cast
|
|
9
|
-
from urllib.parse import urljoin
|
|
11
|
+
from urllib.parse import urljoin, urlparse
|
|
10
12
|
|
|
13
|
+
import requests
|
|
11
14
|
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
|
|
12
15
|
from bs4.element import PreformattedString
|
|
13
16
|
from docling_core.types.doc import (
|
|
@@ -17,6 +20,7 @@ from docling_core.types.doc import (
|
|
|
17
20
|
DocumentOrigin,
|
|
18
21
|
GroupItem,
|
|
19
22
|
GroupLabel,
|
|
23
|
+
PictureItem,
|
|
20
24
|
RefItem,
|
|
21
25
|
RichTableCell,
|
|
22
26
|
TableCell,
|
|
@@ -24,13 +28,18 @@ from docling_core.types.doc import (
|
|
|
24
28
|
TableItem,
|
|
25
29
|
TextItem,
|
|
26
30
|
)
|
|
27
|
-
from docling_core.types.doc.document import ContentLayer, Formatting, Script
|
|
31
|
+
from docling_core.types.doc.document import ContentLayer, Formatting, ImageRef, Script
|
|
32
|
+
from PIL import Image, UnidentifiedImageError
|
|
28
33
|
from pydantic import AnyUrl, BaseModel, ValidationError
|
|
29
34
|
from typing_extensions import override
|
|
30
35
|
|
|
31
|
-
from docling.backend.abstract_backend import
|
|
36
|
+
from docling.backend.abstract_backend import (
|
|
37
|
+
DeclarativeDocumentBackend,
|
|
38
|
+
)
|
|
39
|
+
from docling.datamodel.backend_options import HTMLBackendOptions
|
|
32
40
|
from docling.datamodel.base_models import InputFormat
|
|
33
41
|
from docling.datamodel.document import InputDocument
|
|
42
|
+
from docling.exceptions import OperationNotAllowed
|
|
34
43
|
|
|
35
44
|
_log = logging.getLogger(__name__)
|
|
36
45
|
|
|
@@ -43,6 +52,7 @@ _BLOCK_TAGS: Final = {
|
|
|
43
52
|
"details",
|
|
44
53
|
"figure",
|
|
45
54
|
"footer",
|
|
55
|
+
"img",
|
|
46
56
|
"h1",
|
|
47
57
|
"h2",
|
|
48
58
|
"h3",
|
|
@@ -186,11 +196,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
186
196
|
self,
|
|
187
197
|
in_doc: InputDocument,
|
|
188
198
|
path_or_stream: Union[BytesIO, Path],
|
|
189
|
-
|
|
199
|
+
options: HTMLBackendOptions = HTMLBackendOptions(),
|
|
190
200
|
):
|
|
191
|
-
super().__init__(in_doc, path_or_stream)
|
|
201
|
+
super().__init__(in_doc, path_or_stream, options)
|
|
192
202
|
self.soup: Optional[Tag] = None
|
|
193
|
-
self.path_or_stream = path_or_stream
|
|
203
|
+
self.path_or_stream: Union[BytesIO, Path] = path_or_stream
|
|
204
|
+
self.base_path: Optional[str] = str(options.source_uri)
|
|
194
205
|
|
|
195
206
|
# Initialize the parents for the hierarchy
|
|
196
207
|
self.max_levels = 10
|
|
@@ -200,7 +211,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
200
211
|
for i in range(self.max_levels):
|
|
201
212
|
self.parents[i] = None
|
|
202
213
|
self.hyperlink: Union[AnyUrl, Path, None] = None
|
|
203
|
-
self.original_url = original_url
|
|
204
214
|
self.format_tags: list[str] = []
|
|
205
215
|
|
|
206
216
|
try:
|
|
@@ -261,7 +271,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
261
271
|
content_layer=ContentLayer.FURNITURE,
|
|
262
272
|
)
|
|
263
273
|
# remove script and style tags
|
|
264
|
-
for tag in self.soup(["script", "style"]):
|
|
274
|
+
for tag in self.soup(["script", "noscript", "style"]):
|
|
265
275
|
tag.decompose()
|
|
266
276
|
# remove any hidden tag
|
|
267
277
|
for tag in self.soup(hidden=True):
|
|
@@ -291,6 +301,28 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
291
301
|
self._walk(content, doc)
|
|
292
302
|
return doc
|
|
293
303
|
|
|
304
|
+
@staticmethod
|
|
305
|
+
def _is_remote_url(value: str) -> bool:
|
|
306
|
+
parsed = urlparse(value)
|
|
307
|
+
return parsed.scheme in {"http", "https", "ftp", "s3", "gs"}
|
|
308
|
+
|
|
309
|
+
def _resolve_relative_path(self, loc: str) -> str:
|
|
310
|
+
abs_loc = loc
|
|
311
|
+
|
|
312
|
+
if self.base_path:
|
|
313
|
+
if loc.startswith("//"):
|
|
314
|
+
# Protocol-relative URL - default to https
|
|
315
|
+
abs_loc = "https:" + loc
|
|
316
|
+
elif not loc.startswith(("http://", "https://", "data:", "file://")):
|
|
317
|
+
if HTMLDocumentBackend._is_remote_url(self.base_path): # remote fetch
|
|
318
|
+
abs_loc = urljoin(self.base_path, loc)
|
|
319
|
+
elif self.base_path: # local fetch
|
|
320
|
+
# For local files, resolve relative to the HTML file location
|
|
321
|
+
abs_loc = str(Path(self.base_path).parent / loc)
|
|
322
|
+
|
|
323
|
+
_log.debug(f"Resolved location {loc} to {abs_loc}")
|
|
324
|
+
return abs_loc
|
|
325
|
+
|
|
294
326
|
@staticmethod
|
|
295
327
|
def group_cell_elements(
|
|
296
328
|
group_name: str,
|
|
@@ -520,7 +552,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
520
552
|
if name == "img":
|
|
521
553
|
flush_buffer()
|
|
522
554
|
im_ref3 = self._emit_image(node, doc)
|
|
523
|
-
|
|
555
|
+
if im_ref3:
|
|
556
|
+
added_refs.append(im_ref3)
|
|
524
557
|
elif name in _FORMAT_TAG_MAP:
|
|
525
558
|
with self._use_format([name]):
|
|
526
559
|
wk = self._walk(node, doc)
|
|
@@ -669,8 +702,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
669
702
|
else:
|
|
670
703
|
if isinstance(this_href, str) and this_href:
|
|
671
704
|
old_hyperlink = self.hyperlink
|
|
672
|
-
|
|
673
|
-
this_href = urljoin(str(self.original_url), str(this_href))
|
|
705
|
+
this_href = self._resolve_relative_path(this_href)
|
|
674
706
|
# ugly fix for relative links since pydantic does not support them.
|
|
675
707
|
try:
|
|
676
708
|
new_hyperlink = AnyUrl(this_href)
|
|
@@ -837,7 +869,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
837
869
|
for img_tag in tag("img"):
|
|
838
870
|
if isinstance(img_tag, Tag):
|
|
839
871
|
im_ref = self._emit_image(img_tag, doc)
|
|
840
|
-
|
|
872
|
+
if im_ref:
|
|
873
|
+
added_ref.append(im_ref)
|
|
841
874
|
return added_ref
|
|
842
875
|
|
|
843
876
|
def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem:
|
|
@@ -1003,7 +1036,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1003
1036
|
img_tag = tag.find("img")
|
|
1004
1037
|
if isinstance(img_tag, Tag):
|
|
1005
1038
|
im_ref = self._emit_image(img_tag, doc)
|
|
1006
|
-
|
|
1039
|
+
if im_ref is not None:
|
|
1040
|
+
added_refs.append(im_ref)
|
|
1007
1041
|
|
|
1008
1042
|
elif tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
|
|
1009
1043
|
heading_refs = self._handle_heading(tag, doc)
|
|
@@ -1061,7 +1095,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1061
1095
|
for img_tag in tag("img"):
|
|
1062
1096
|
if isinstance(img_tag, Tag):
|
|
1063
1097
|
im_ref2 = self._emit_image(tag, doc)
|
|
1064
|
-
|
|
1098
|
+
if im_ref2 is not None:
|
|
1099
|
+
added_refs.append(im_ref2)
|
|
1065
1100
|
|
|
1066
1101
|
elif tag_name in {"pre"}:
|
|
1067
1102
|
# handle monospace code snippets (pre).
|
|
@@ -1092,10 +1127,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1092
1127
|
self._walk(tag, doc)
|
|
1093
1128
|
return added_refs
|
|
1094
1129
|
|
|
1095
|
-
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> RefItem:
|
|
1130
|
+
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> Optional[RefItem]:
|
|
1096
1131
|
figure = img_tag.find_parent("figure")
|
|
1097
1132
|
caption: AnnotatedTextList = AnnotatedTextList()
|
|
1098
1133
|
|
|
1134
|
+
parent = self.parents[self.level]
|
|
1135
|
+
|
|
1099
1136
|
# check if the figure has a link - this is HACK:
|
|
1100
1137
|
def get_img_hyperlink(img_tag):
|
|
1101
1138
|
this_parent = img_tag.parent
|
|
@@ -1106,9 +1143,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1106
1143
|
return None
|
|
1107
1144
|
|
|
1108
1145
|
if img_hyperlink := get_img_hyperlink(img_tag):
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
)
|
|
1146
|
+
img_text = img_tag.get("alt") or ""
|
|
1147
|
+
caption.append(AnnotatedText(text=img_text, hyperlink=img_hyperlink))
|
|
1112
1148
|
|
|
1113
1149
|
if isinstance(figure, Tag):
|
|
1114
1150
|
caption_tag = figure.find("figcaption", recursive=False)
|
|
@@ -1135,13 +1171,78 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1135
1171
|
hyperlink=caption_anno_text.hyperlink,
|
|
1136
1172
|
)
|
|
1137
1173
|
|
|
1174
|
+
src_loc: str = self._get_attr_as_string(img_tag, "src")
|
|
1175
|
+
if not cast(HTMLBackendOptions, self.options).fetch_images or not src_loc:
|
|
1176
|
+
# Do not fetch the image, just add a placeholder
|
|
1177
|
+
placeholder: PictureItem = doc.add_picture(
|
|
1178
|
+
caption=caption_item,
|
|
1179
|
+
parent=parent,
|
|
1180
|
+
content_layer=self.content_layer,
|
|
1181
|
+
)
|
|
1182
|
+
return placeholder.get_ref()
|
|
1183
|
+
|
|
1184
|
+
src_loc = self._resolve_relative_path(src_loc)
|
|
1185
|
+
img_ref = self._create_image_ref(src_loc)
|
|
1186
|
+
|
|
1138
1187
|
docling_pic = doc.add_picture(
|
|
1188
|
+
image=img_ref,
|
|
1139
1189
|
caption=caption_item,
|
|
1140
|
-
parent=
|
|
1190
|
+
parent=parent,
|
|
1141
1191
|
content_layer=self.content_layer,
|
|
1142
1192
|
)
|
|
1143
1193
|
return docling_pic.get_ref()
|
|
1144
1194
|
|
|
1195
|
+
def _create_image_ref(self, src_url: str) -> Optional[ImageRef]:
|
|
1196
|
+
try:
|
|
1197
|
+
img_data = self._load_image_data(src_url)
|
|
1198
|
+
if img_data:
|
|
1199
|
+
img = Image.open(BytesIO(img_data))
|
|
1200
|
+
return ImageRef.from_pil(img, dpi=int(img.info.get("dpi", (72,))[0]))
|
|
1201
|
+
except (
|
|
1202
|
+
requests.HTTPError,
|
|
1203
|
+
ValidationError,
|
|
1204
|
+
UnidentifiedImageError,
|
|
1205
|
+
OperationNotAllowed,
|
|
1206
|
+
TypeError,
|
|
1207
|
+
ValueError,
|
|
1208
|
+
) as e:
|
|
1209
|
+
warnings.warn(f"Could not process an image from {src_url}: {e}")
|
|
1210
|
+
|
|
1211
|
+
return None
|
|
1212
|
+
|
|
1213
|
+
def _load_image_data(self, src_loc: str) -> Optional[bytes]:
|
|
1214
|
+
if src_loc.lower().endswith(".svg"):
|
|
1215
|
+
_log.debug(f"Skipping SVG file: {src_loc}")
|
|
1216
|
+
return None
|
|
1217
|
+
|
|
1218
|
+
if HTMLDocumentBackend._is_remote_url(src_loc):
|
|
1219
|
+
if not self.options.enable_remote_fetch:
|
|
1220
|
+
raise OperationNotAllowed(
|
|
1221
|
+
"Fetching remote resources is only allowed when set explicitly. "
|
|
1222
|
+
"Set options.enable_remote_fetch=True."
|
|
1223
|
+
)
|
|
1224
|
+
response = requests.get(src_loc, stream=True)
|
|
1225
|
+
response.raise_for_status()
|
|
1226
|
+
return response.content
|
|
1227
|
+
elif src_loc.startswith("data:"):
|
|
1228
|
+
data = re.sub(r"^data:image/.+;base64,", "", src_loc)
|
|
1229
|
+
return base64.b64decode(data)
|
|
1230
|
+
|
|
1231
|
+
if src_loc.startswith("file://"):
|
|
1232
|
+
src_loc = src_loc[7:]
|
|
1233
|
+
|
|
1234
|
+
if not self.options.enable_local_fetch:
|
|
1235
|
+
raise OperationNotAllowed(
|
|
1236
|
+
"Fetching local resources is only allowed when set explicitly. "
|
|
1237
|
+
"Set options.enable_local_fetch=True."
|
|
1238
|
+
)
|
|
1239
|
+
# add check that file exists and can read
|
|
1240
|
+
if os.path.isfile(src_loc) and os.access(src_loc, os.R_OK):
|
|
1241
|
+
with open(src_loc, "rb") as f:
|
|
1242
|
+
return f.read()
|
|
1243
|
+
else:
|
|
1244
|
+
raise ValueError("File does not exist or it is not readable.")
|
|
1245
|
+
|
|
1145
1246
|
@staticmethod
|
|
1146
1247
|
def get_text(item: PageElement) -> str:
|
|
1147
1248
|
"""Concatenate all child strings of a PageElement.
|
|
@@ -1238,3 +1339,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1238
1339
|
)
|
|
1239
1340
|
|
|
1240
1341
|
return int_spans
|
|
1342
|
+
|
|
1343
|
+
@staticmethod
|
|
1344
|
+
def _get_attr_as_string(tag: Tag, attr: str, default: str = "") -> str:
|
|
1345
|
+
"""Get attribute value as string, handling list values."""
|
|
1346
|
+
value = tag.get(attr)
|
|
1347
|
+
if not value:
|
|
1348
|
+
return default
|
|
1349
|
+
|
|
1350
|
+
return value[0] if isinstance(value, list) else value
|
|
@@ -24,10 +24,16 @@ from docling_core.types.doc import (
|
|
|
24
24
|
from docling_core.types.doc.document import Formatting
|
|
25
25
|
from marko import Markdown
|
|
26
26
|
from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
|
|
27
|
-
from typing_extensions import Annotated
|
|
27
|
+
from typing_extensions import Annotated, override
|
|
28
28
|
|
|
29
|
-
from docling.backend.abstract_backend import
|
|
29
|
+
from docling.backend.abstract_backend import (
|
|
30
|
+
DeclarativeDocumentBackend,
|
|
31
|
+
)
|
|
30
32
|
from docling.backend.html_backend import HTMLDocumentBackend
|
|
33
|
+
from docling.datamodel.backend_options import (
|
|
34
|
+
HTMLBackendOptions,
|
|
35
|
+
MarkdownBackendOptions,
|
|
36
|
+
)
|
|
31
37
|
from docling.datamodel.base_models import InputFormat
|
|
32
38
|
from docling.datamodel.document import InputDocument
|
|
33
39
|
|
|
@@ -88,8 +94,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
|
88
94
|
|
|
89
95
|
return shortened_text
|
|
90
96
|
|
|
91
|
-
|
|
92
|
-
|
|
97
|
+
@override
|
|
98
|
+
def __init__(
|
|
99
|
+
self,
|
|
100
|
+
in_doc: InputDocument,
|
|
101
|
+
path_or_stream: Union[BytesIO, Path],
|
|
102
|
+
options: MarkdownBackendOptions = MarkdownBackendOptions(),
|
|
103
|
+
):
|
|
104
|
+
super().__init__(in_doc, path_or_stream, options)
|
|
93
105
|
|
|
94
106
|
_log.debug("Starting MarkdownDocumentBackend...")
|
|
95
107
|
|
|
@@ -575,14 +587,24 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
|
575
587
|
self._html_blocks = 0
|
|
576
588
|
# delegate to HTML backend
|
|
577
589
|
stream = BytesIO(bytes(html_str, encoding="utf-8"))
|
|
590
|
+
md_options = cast(MarkdownBackendOptions, self.options)
|
|
591
|
+
html_options = HTMLBackendOptions(
|
|
592
|
+
enable_local_fetch=md_options.enable_local_fetch,
|
|
593
|
+
enable_remote_fetch=md_options.enable_remote_fetch,
|
|
594
|
+
fetch_images=md_options.fetch_images,
|
|
595
|
+
source_uri=md_options.source_uri,
|
|
596
|
+
)
|
|
578
597
|
in_doc = InputDocument(
|
|
579
598
|
path_or_stream=stream,
|
|
580
599
|
format=InputFormat.HTML,
|
|
581
600
|
backend=html_backend_cls,
|
|
582
601
|
filename=self.file.name,
|
|
602
|
+
backend_options=html_options,
|
|
583
603
|
)
|
|
584
604
|
html_backend_obj = html_backend_cls(
|
|
585
|
-
in_doc=in_doc,
|
|
605
|
+
in_doc=in_doc,
|
|
606
|
+
path_or_stream=stream,
|
|
607
|
+
options=html_options,
|
|
586
608
|
)
|
|
587
609
|
doc = html_backend_obj.convert()
|
|
588
610
|
else:
|