docling 2.47.1__tar.gz → 2.49.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.47.1 → docling-2.49.0}/PKG-INFO +5 -2
- {docling-2.47.1 → docling-2.49.0}/docling/backend/html_backend.py +172 -76
- {docling-2.47.1 → docling-2.49.0}/docling/backend/msexcel_backend.py +15 -1
- {docling-2.47.1 → docling-2.49.0}/docling/backend/pypdfium2_backend.py +24 -2
- {docling-2.47.1 → docling-2.49.0}/docling/datamodel/base_models.py +13 -1
- {docling-2.47.1 → docling-2.49.0}/docling/datamodel/document.py +5 -3
- docling-2.49.0/docling/datamodel/extraction.py +39 -0
- {docling-2.47.1 → docling-2.49.0}/docling/datamodel/pipeline_options.py +12 -4
- {docling-2.47.1 → docling-2.49.0}/docling/datamodel/vlm_model_specs.py +17 -0
- {docling-2.47.1 → docling-2.49.0}/docling/document_converter.py +3 -6
- docling-2.49.0/docling/document_extractor.py +325 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/rapid_ocr_model.py +38 -16
- docling-2.49.0/docling/models/vlm_models_inline/nuextract_transformers_model.py +290 -0
- docling-2.49.0/docling/pipeline/base_extraction_pipeline.py +58 -0
- docling-2.49.0/docling/pipeline/extraction_vlm_pipeline.py +204 -0
- {docling-2.47.1 → docling-2.49.0}/docling.egg-info/PKG-INFO +5 -2
- {docling-2.47.1 → docling-2.49.0}/docling.egg-info/SOURCES.txt +6 -0
- {docling-2.47.1 → docling-2.49.0}/docling.egg-info/requires.txt +5 -2
- {docling-2.47.1 → docling-2.49.0}/pyproject.toml +6 -2
- {docling-2.47.1 → docling-2.49.0}/tests/test_backend_msexcel.py +3 -2
- {docling-2.47.1 → docling-2.49.0}/tests/test_backend_pdfium.py +19 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_backend_webp.py +2 -2
- {docling-2.47.1 → docling-2.49.0}/tests/test_e2e_ocr_conversion.py +2 -1
- docling-2.49.0/tests/test_extraction.py +108 -0
- {docling-2.47.1 → docling-2.49.0}/LICENSE +0 -0
- {docling-2.47.1 → docling-2.49.0}/README.md +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/__init__.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/backend/__init__.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/backend/docling_parse_v4_backend.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/backend/md_backend.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/backend/mets_gbs_backend.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/backend/noop_backend.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/chunking/__init__.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/cli/__init__.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/cli/main.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/cli/models.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/cli/tools.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/datamodel/asr_model_specs.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/datamodel/layout_model_specs.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/datamodel/settings.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/exceptions.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/__init__.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/api_vlm_model.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/base_model.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/layout_model.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/readingorder_model.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/utils/__init__.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/utils/hf_model_download.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/models/vlm_models_inline/vllm_model.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/pipeline/asr_pipeline.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/pipeline/vlm_pipeline.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/py.typed +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/utils/__init__.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/utils/export.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/utils/locks.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/utils/model_downloader.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/utils/orientation.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/utils/profiling.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/utils/utils.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling/utils/visualization.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.47.1 → docling-2.49.0}/docling.egg-info/top_level.txt +0 -0
- {docling-2.47.1 → docling-2.49.0}/setup.cfg +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_asr_pipeline.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_backend_csv.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_backend_docling_json.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_backend_docling_parse_v4.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_backend_html.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_backend_jats.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_backend_markdown.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_backend_mets_gbs.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_backend_msword.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_backend_patent_uspto.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_backend_pptx.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_cli.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_code_formula.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_data_gen_flag.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_document_picture_classifier.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_e2e_conversion.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_input_doc.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_interfaces.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_invalid_input.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_legacy_format_transform.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_ocr_utils.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_options.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_settings_load.py +0 -0
- {docling-2.47.1 → docling-2.49.0}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.47.1
|
3
|
+
Version: 2.49.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -51,6 +51,7 @@ Requires-Dist: pluggy<2.0.0,>=1.0.0
|
|
51
51
|
Requires-Dist: pylatexenc<3.0,>=2.10
|
52
52
|
Requires-Dist: scipy<2.0.0,>=1.6.0
|
53
53
|
Requires-Dist: accelerate<2,>=1.0.0
|
54
|
+
Requires-Dist: polyfactory>=2.22.2
|
54
55
|
Provides-Extra: tesserocr
|
55
56
|
Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
|
56
57
|
Provides-Extra: ocrmac
|
@@ -60,9 +61,11 @@ Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
|
|
60
61
|
Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
|
61
62
|
Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
|
62
63
|
Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and sys_platform == "linux" and platform_machine == "x86_64") and extra == "vlm"
|
64
|
+
Requires-Dist: qwen-vl-utils>=0.0.11; extra == "vlm"
|
63
65
|
Provides-Extra: rapidocr
|
64
|
-
Requires-Dist: rapidocr
|
66
|
+
Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14" and extra == "rapidocr"
|
65
67
|
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
|
68
|
+
Requires-Dist: modelscope>=1.29.0; extra == "rapidocr"
|
66
69
|
Provides-Extra: asr
|
67
70
|
Requires-Dist: openai-whisper>=20250625; extra == "asr"
|
68
71
|
Dynamic: license-file
|
@@ -1,5 +1,6 @@
|
|
1
1
|
import logging
|
2
2
|
import re
|
3
|
+
import traceback
|
3
4
|
from contextlib import contextmanager
|
4
5
|
from copy import deepcopy
|
5
6
|
from io import BytesIO
|
@@ -45,20 +46,22 @@ _BLOCK_TAGS: Final = {
|
|
45
46
|
"h4",
|
46
47
|
"h5",
|
47
48
|
"h6",
|
49
|
+
"ol",
|
48
50
|
"p",
|
49
51
|
"pre",
|
50
|
-
"code",
|
51
|
-
"ul",
|
52
|
-
"ol",
|
53
52
|
"summary",
|
54
53
|
"table",
|
54
|
+
"ul",
|
55
55
|
}
|
56
56
|
|
57
|
+
_CODE_TAG_SET: Final = {"code", "kbd", "samp"}
|
58
|
+
|
57
59
|
_FORMAT_TAG_MAP: Final = {
|
58
60
|
"b": {"bold": True},
|
59
61
|
"strong": {"bold": True},
|
60
62
|
"i": {"italic": True},
|
61
63
|
"em": {"italic": True},
|
64
|
+
"var": {"italic": True},
|
62
65
|
# "mark",
|
63
66
|
# "small",
|
64
67
|
"s": {"strikethrough": True},
|
@@ -67,6 +70,7 @@ _FORMAT_TAG_MAP: Final = {
|
|
67
70
|
"ins": {"underline": True},
|
68
71
|
"sub": {"script": Script.SUB},
|
69
72
|
"sup": {"script": Script.SUPER},
|
73
|
+
**{k: {} for k in _CODE_TAG_SET},
|
70
74
|
}
|
71
75
|
|
72
76
|
|
@@ -79,6 +83,7 @@ class AnnotatedText(BaseModel):
|
|
79
83
|
text: str
|
80
84
|
hyperlink: Union[AnyUrl, Path, None] = None
|
81
85
|
formatting: Union[Formatting, None] = None
|
86
|
+
code: bool = False
|
82
87
|
|
83
88
|
|
84
89
|
class AnnotatedTextList(list):
|
@@ -86,10 +91,12 @@ class AnnotatedTextList(list):
|
|
86
91
|
current_h = None
|
87
92
|
current_text = ""
|
88
93
|
current_f = None
|
94
|
+
current_code = False
|
89
95
|
for at in self:
|
90
96
|
t = at.text
|
91
97
|
h = at.hyperlink
|
92
98
|
f = at.formatting
|
99
|
+
c = at.code
|
93
100
|
current_text += t.strip() + " "
|
94
101
|
if f is not None and current_f is None:
|
95
102
|
current_f = f
|
@@ -103,8 +110,13 @@ class AnnotatedTextList(list):
|
|
103
110
|
_log.warning(
|
104
111
|
f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
|
105
112
|
)
|
113
|
+
current_code = c if c else current_code
|
114
|
+
|
106
115
|
return AnnotatedText(
|
107
|
-
text=current_text.strip(),
|
116
|
+
text=current_text.strip(),
|
117
|
+
hyperlink=current_h,
|
118
|
+
formatting=current_f,
|
119
|
+
code=current_code,
|
108
120
|
)
|
109
121
|
|
110
122
|
def simplify_text_elements(self) -> "AnnotatedTextList":
|
@@ -114,9 +126,14 @@ class AnnotatedTextList(list):
|
|
114
126
|
text = self[0].text
|
115
127
|
hyperlink = self[0].hyperlink
|
116
128
|
formatting = self[0].formatting
|
129
|
+
code = self[0].code
|
117
130
|
last_elm = text
|
118
131
|
for i in range(1, len(self)):
|
119
|
-
if
|
132
|
+
if (
|
133
|
+
hyperlink == self[i].hyperlink
|
134
|
+
and formatting == self[i].formatting
|
135
|
+
and code == self[i].code
|
136
|
+
):
|
120
137
|
sep = " "
|
121
138
|
if not self[i].text.strip() or not last_elm.strip():
|
122
139
|
sep = ""
|
@@ -124,15 +141,20 @@ class AnnotatedTextList(list):
|
|
124
141
|
last_elm = self[i].text
|
125
142
|
else:
|
126
143
|
simplified.append(
|
127
|
-
AnnotatedText(
|
144
|
+
AnnotatedText(
|
145
|
+
text=text, hyperlink=hyperlink, formatting=formatting, code=code
|
146
|
+
)
|
128
147
|
)
|
129
148
|
text = self[i].text
|
130
149
|
last_elm = text
|
131
150
|
hyperlink = self[i].hyperlink
|
132
151
|
formatting = self[i].formatting
|
152
|
+
code = self[i].code
|
133
153
|
if text:
|
134
154
|
simplified.append(
|
135
|
-
AnnotatedText(
|
155
|
+
AnnotatedText(
|
156
|
+
text=text, hyperlink=hyperlink, formatting=formatting, code=code
|
157
|
+
)
|
136
158
|
)
|
137
159
|
return simplified
|
138
160
|
|
@@ -174,7 +196,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
174
196
|
self.ctx = _Context()
|
175
197
|
for i in range(self.max_levels):
|
176
198
|
self.parents[i] = None
|
177
|
-
self.hyperlink = None
|
199
|
+
self.hyperlink: Union[AnyUrl, Path, None] = None
|
178
200
|
self.original_url = original_url
|
179
201
|
self.format_tags: list[str] = []
|
180
202
|
|
@@ -235,9 +257,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
235
257
|
orig=title_text,
|
236
258
|
content_layer=ContentLayer.FURNITURE,
|
237
259
|
)
|
238
|
-
# remove
|
260
|
+
# remove script and style tags
|
239
261
|
for tag in self.soup(["script", "style"]):
|
240
262
|
tag.decompose()
|
263
|
+
# remove any hidden tag
|
264
|
+
for tag in self.soup(hidden=True):
|
265
|
+
tag.decompose()
|
266
|
+
|
241
267
|
content = self.soup.body or self.soup
|
242
268
|
# normalize <br> tags
|
243
269
|
for br in content("br"):
|
@@ -268,7 +294,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
268
294
|
def flush_buffer():
|
269
295
|
if not buffer:
|
270
296
|
return
|
271
|
-
annotated_text_list = buffer.simplify_text_elements()
|
297
|
+
annotated_text_list: AnnotatedTextList = buffer.simplify_text_elements()
|
272
298
|
parts = annotated_text_list.split_by_newline()
|
273
299
|
buffer.clear()
|
274
300
|
|
@@ -276,20 +302,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
276
302
|
return
|
277
303
|
|
278
304
|
for annotated_text_list in parts:
|
279
|
-
with self.
|
305
|
+
with self._use_inline_group(annotated_text_list, doc):
|
280
306
|
for annotated_text in annotated_text_list:
|
281
307
|
if annotated_text.text.strip():
|
282
308
|
seg_clean = HTMLDocumentBackend._clean_unicode(
|
283
309
|
annotated_text.text.strip()
|
284
310
|
)
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
311
|
+
if annotated_text.code:
|
312
|
+
doc.add_code(
|
313
|
+
parent=self.parents[self.level],
|
314
|
+
text=seg_clean,
|
315
|
+
content_layer=self.content_layer,
|
316
|
+
formatting=annotated_text.formatting,
|
317
|
+
hyperlink=annotated_text.hyperlink,
|
318
|
+
)
|
319
|
+
else:
|
320
|
+
doc.add_text(
|
321
|
+
parent=self.parents[self.level],
|
322
|
+
label=DocItemLabel.TEXT,
|
323
|
+
text=seg_clean,
|
324
|
+
content_layer=self.content_layer,
|
325
|
+
formatting=annotated_text.formatting,
|
326
|
+
hyperlink=annotated_text.hyperlink,
|
327
|
+
)
|
293
328
|
|
294
329
|
for node in element.contents:
|
295
330
|
if isinstance(node, Tag):
|
@@ -298,10 +333,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
298
333
|
flush_buffer()
|
299
334
|
self._emit_image(node, doc)
|
300
335
|
elif name in _FORMAT_TAG_MAP:
|
301
|
-
with self.
|
336
|
+
with self._use_format([name]):
|
302
337
|
self._walk(node, doc)
|
303
338
|
elif name == "a":
|
304
|
-
with self.
|
339
|
+
with self._use_hyperlink(node):
|
305
340
|
self._walk(node, doc)
|
306
341
|
elif name in _BLOCK_TAGS:
|
307
342
|
flush_buffer()
|
@@ -367,8 +402,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
367
402
|
this_parent = item.parent
|
368
403
|
while this_parent is not None:
|
369
404
|
if this_parent.name == "a" and this_parent.get("href"):
|
370
|
-
with self.
|
371
|
-
with self.
|
405
|
+
with self._use_format(format_tags):
|
406
|
+
with self._use_hyperlink(this_parent):
|
372
407
|
return self._extract_text_and_hyperlink_recursively(
|
373
408
|
item, ignore_list
|
374
409
|
)
|
@@ -379,6 +414,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
379
414
|
|
380
415
|
if isinstance(item, NavigableString):
|
381
416
|
text = item.strip()
|
417
|
+
code = any(code_tag in self.format_tags for code_tag in _CODE_TAG_SET)
|
382
418
|
if text:
|
383
419
|
return AnnotatedTextList(
|
384
420
|
[
|
@@ -386,6 +422,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
386
422
|
text=text,
|
387
423
|
hyperlink=self.hyperlink,
|
388
424
|
formatting=self._formatting,
|
425
|
+
code=code,
|
389
426
|
)
|
390
427
|
]
|
391
428
|
)
|
@@ -396,6 +433,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
396
433
|
text="\n",
|
397
434
|
hyperlink=self.hyperlink,
|
398
435
|
formatting=self._formatting,
|
436
|
+
code=code,
|
399
437
|
)
|
400
438
|
]
|
401
439
|
)
|
@@ -405,14 +443,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
405
443
|
if not ignore_list or (tag.name not in ["ul", "ol"]):
|
406
444
|
for child in tag:
|
407
445
|
if isinstance(child, Tag) and child.name in _FORMAT_TAG_MAP:
|
408
|
-
with self.
|
446
|
+
with self._use_format([child.name]):
|
409
447
|
result.extend(
|
410
448
|
self._extract_text_and_hyperlink_recursively(
|
411
449
|
child, ignore_list, keep_newlines=keep_newlines
|
412
450
|
)
|
413
451
|
)
|
414
452
|
elif isinstance(child, Tag) and child.name == "a":
|
415
|
-
with self.
|
453
|
+
with self._use_hyperlink(child):
|
416
454
|
result.extend(
|
417
455
|
self._extract_text_and_hyperlink_recursively(
|
418
456
|
child, ignore_list, keep_newlines=keep_newlines
|
@@ -428,29 +466,30 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
428
466
|
return result
|
429
467
|
|
430
468
|
@contextmanager
|
431
|
-
def
|
469
|
+
def _use_hyperlink(self, tag: Tag):
|
432
470
|
this_href = tag.get("href")
|
433
471
|
if this_href is None:
|
434
472
|
yield None
|
435
473
|
else:
|
436
|
-
if this_href:
|
437
|
-
old_hyperlink = self.hyperlink
|
474
|
+
if isinstance(this_href, str) and this_href:
|
475
|
+
old_hyperlink: Union[AnyUrl, Path, None] = self.hyperlink
|
476
|
+
new_hyperlink: Union[AnyUrl, Path, None] = None
|
438
477
|
if self.original_url is not None:
|
439
|
-
this_href = urljoin(self.original_url, this_href)
|
478
|
+
this_href = urljoin(str(self.original_url), str(this_href))
|
440
479
|
# ugly fix for relative links since pydantic does not support them.
|
441
480
|
try:
|
442
|
-
AnyUrl(this_href)
|
481
|
+
new_hyperlink = AnyUrl(this_href)
|
443
482
|
except ValidationError:
|
444
|
-
|
445
|
-
self.hyperlink =
|
483
|
+
new_hyperlink = Path(this_href)
|
484
|
+
self.hyperlink = new_hyperlink
|
446
485
|
try:
|
447
486
|
yield None
|
448
487
|
finally:
|
449
|
-
if
|
488
|
+
if new_hyperlink:
|
450
489
|
self.hyperlink = old_hyperlink
|
451
490
|
|
452
491
|
@contextmanager
|
453
|
-
def
|
492
|
+
def _use_format(self, tags: list[str]):
|
454
493
|
if not tags:
|
455
494
|
yield None
|
456
495
|
else:
|
@@ -461,7 +500,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
461
500
|
self.format_tags = self.format_tags[: -len(tags)]
|
462
501
|
|
463
502
|
@contextmanager
|
464
|
-
def
|
503
|
+
def _use_inline_group(
|
465
504
|
self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
|
466
505
|
):
|
467
506
|
"""Create an inline group for annotated texts.
|
@@ -473,9 +512,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
473
512
|
Args:
|
474
513
|
annotated_text_list (AnnotatedTextList): Annotated text
|
475
514
|
doc (DoclingDocument): Currently used document
|
476
|
-
|
477
|
-
Yields:
|
478
|
-
None: _description_
|
479
515
|
"""
|
480
516
|
if len(annotated_text_list) > 1:
|
481
517
|
inline_fmt = doc.add_group(
|
@@ -493,6 +529,57 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
493
529
|
else:
|
494
530
|
yield None
|
495
531
|
|
532
|
+
@contextmanager
|
533
|
+
def _use_details(self, tag: Tag, doc: DoclingDocument):
|
534
|
+
"""Create a group with the content of a details tag.
|
535
|
+
|
536
|
+
While the context manager is active, the hierarchy level is set one
|
537
|
+
level higher as the current parent.
|
538
|
+
|
539
|
+
Args:
|
540
|
+
tag: The details tag.
|
541
|
+
doc: Currently used document.
|
542
|
+
"""
|
543
|
+
self.parents[self.level + 1] = doc.add_group(
|
544
|
+
name=tag.name,
|
545
|
+
label=GroupLabel.SECTION,
|
546
|
+
parent=self.parents[self.level],
|
547
|
+
content_layer=self.content_layer,
|
548
|
+
)
|
549
|
+
self.level += 1
|
550
|
+
try:
|
551
|
+
yield None
|
552
|
+
finally:
|
553
|
+
self.parents[self.level + 1] = None
|
554
|
+
self.level -= 1
|
555
|
+
|
556
|
+
@contextmanager
|
557
|
+
def _use_footer(self, tag: Tag, doc: DoclingDocument):
|
558
|
+
"""Create a group with a footer.
|
559
|
+
|
560
|
+
Create a group with the content of a footer tag. While the context manager
|
561
|
+
is active, the hierarchy level is set one level higher as the current parent.
|
562
|
+
|
563
|
+
Args:
|
564
|
+
tag: The footer tag.
|
565
|
+
doc: Currently used document.
|
566
|
+
"""
|
567
|
+
current_layer = self.content_layer
|
568
|
+
self.content_layer = ContentLayer.FURNITURE
|
569
|
+
self.parents[self.level + 1] = doc.add_group(
|
570
|
+
name=tag.name,
|
571
|
+
label=GroupLabel.SECTION,
|
572
|
+
parent=self.parents[self.level],
|
573
|
+
content_layer=self.content_layer,
|
574
|
+
)
|
575
|
+
self.level += 1
|
576
|
+
try:
|
577
|
+
yield None
|
578
|
+
finally:
|
579
|
+
self.parents[self.level + 1] = None
|
580
|
+
self.level -= 1
|
581
|
+
self.content_layer = current_layer
|
582
|
+
|
496
583
|
def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
|
497
584
|
tag_name = tag.name.lower()
|
498
585
|
# set default content layer to BODY as soon as we encounter a heading
|
@@ -611,20 +698,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
611
698
|
content_layer=self.content_layer,
|
612
699
|
)
|
613
700
|
self.level += 1
|
614
|
-
with self.
|
701
|
+
with self._use_inline_group(min_parts, doc):
|
615
702
|
for annotated_text in min_parts:
|
616
703
|
li_text = re.sub(
|
617
704
|
r"\s+|\n+", " ", annotated_text.text
|
618
705
|
).strip()
|
619
706
|
li_clean = HTMLDocumentBackend._clean_unicode(li_text)
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
707
|
+
if annotated_text.code:
|
708
|
+
doc.add_code(
|
709
|
+
parent=self.parents[self.level],
|
710
|
+
text=li_clean,
|
711
|
+
content_layer=self.content_layer,
|
712
|
+
formatting=annotated_text.formatting,
|
713
|
+
hyperlink=annotated_text.hyperlink,
|
714
|
+
)
|
715
|
+
else:
|
716
|
+
doc.add_text(
|
717
|
+
parent=self.parents[self.level],
|
718
|
+
label=DocItemLabel.TEXT,
|
719
|
+
text=li_clean,
|
720
|
+
content_layer=self.content_layer,
|
721
|
+
formatting=annotated_text.formatting,
|
722
|
+
hyperlink=annotated_text.hyperlink,
|
723
|
+
)
|
628
724
|
|
629
725
|
# 4) recurse into any nested lists, attaching them to this <li> item
|
630
726
|
for sublist in li({"ul", "ol"}, recursive=False):
|
@@ -687,20 +783,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
687
783
|
text_list = self._extract_text_and_hyperlink_recursively(
|
688
784
|
tag, find_parent_annotation=True
|
689
785
|
)
|
690
|
-
annotated_texts = text_list.simplify_text_elements()
|
786
|
+
annotated_texts: AnnotatedTextList = text_list.simplify_text_elements()
|
691
787
|
for part in annotated_texts.split_by_newline():
|
692
|
-
with self.
|
788
|
+
with self._use_inline_group(part, doc):
|
693
789
|
for annotated_text in part:
|
694
790
|
if seg := annotated_text.text.strip():
|
695
791
|
seg_clean = HTMLDocumentBackend._clean_unicode(seg)
|
696
|
-
|
697
|
-
|
698
|
-
|
699
|
-
|
700
|
-
|
701
|
-
|
702
|
-
|
703
|
-
|
792
|
+
if annotated_text.code:
|
793
|
+
doc.add_code(
|
794
|
+
parent=self.parents[self.level],
|
795
|
+
text=seg_clean,
|
796
|
+
content_layer=self.content_layer,
|
797
|
+
formatting=annotated_text.formatting,
|
798
|
+
hyperlink=annotated_text.hyperlink,
|
799
|
+
)
|
800
|
+
else:
|
801
|
+
doc.add_text(
|
802
|
+
parent=self.parents[self.level],
|
803
|
+
label=DocItemLabel.TEXT,
|
804
|
+
text=seg_clean,
|
805
|
+
content_layer=self.content_layer,
|
806
|
+
formatting=annotated_text.formatting,
|
807
|
+
hyperlink=annotated_text.hyperlink,
|
808
|
+
)
|
704
809
|
|
705
810
|
for img_tag in tag("img"):
|
706
811
|
if isinstance(img_tag, Tag):
|
@@ -718,13 +823,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
718
823
|
content_layer=self.content_layer,
|
719
824
|
)
|
720
825
|
|
721
|
-
elif tag_name in {"pre"
|
826
|
+
elif tag_name in {"pre"}:
|
722
827
|
# handle monospace code snippets (pre).
|
723
828
|
text_list = self._extract_text_and_hyperlink_recursively(
|
724
|
-
tag, find_parent_annotation=True
|
829
|
+
tag, find_parent_annotation=True, keep_newlines=True
|
725
830
|
)
|
726
831
|
annotated_texts = text_list.simplify_text_elements()
|
727
|
-
with self.
|
832
|
+
with self._use_inline_group(annotated_texts, doc):
|
728
833
|
for annotated_text in annotated_texts:
|
729
834
|
text_clean = HTMLDocumentBackend._clean_unicode(
|
730
835
|
annotated_text.text.strip()
|
@@ -737,22 +842,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
737
842
|
hyperlink=annotated_text.hyperlink,
|
738
843
|
)
|
739
844
|
|
740
|
-
elif tag_name
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
parent=self.parents[self.level],
|
748
|
-
content_layer=self.content_layer,
|
749
|
-
)
|
750
|
-
self.level += 1
|
751
|
-
self._walk(tag, doc)
|
752
|
-
self.parents[self.level + 1] = None
|
753
|
-
self.level -= 1
|
754
|
-
if tag_name == "footer":
|
755
|
-
self.content_layer = current_layer
|
845
|
+
elif tag_name == "footer":
|
846
|
+
with self._use_footer(tag, doc):
|
847
|
+
self._walk(tag, doc)
|
848
|
+
|
849
|
+
elif tag_name == "details":
|
850
|
+
with self._use_details(tag, doc):
|
851
|
+
self._walk(tag, doc)
|
756
852
|
|
757
853
|
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
|
758
854
|
figure = img_tag.find_parent("figure")
|
@@ -1,10 +1,11 @@
|
|
1
1
|
import logging
|
2
2
|
from io import BytesIO
|
3
3
|
from pathlib import Path
|
4
|
-
from typing import Any, Union, cast
|
4
|
+
from typing import Any, Optional, Union, cast
|
5
5
|
|
6
6
|
from docling_core.types.doc import (
|
7
7
|
BoundingBox,
|
8
|
+
ContentLayer,
|
8
9
|
CoordOrigin,
|
9
10
|
DocItem,
|
10
11
|
DoclingDocument,
|
@@ -197,6 +198,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
197
198
|
parent=None,
|
198
199
|
label=GroupLabel.SECTION,
|
199
200
|
name=f"sheet: {sheet_name}",
|
201
|
+
content_layer=self._get_sheet_content_layer(sheet),
|
200
202
|
)
|
201
203
|
doc = self._convert_sheet(doc, sheet)
|
202
204
|
width, height = self._find_page_size(doc, page_no)
|
@@ -237,6 +239,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
237
239
|
"""
|
238
240
|
|
239
241
|
if self.workbook is not None:
|
242
|
+
content_layer = self._get_sheet_content_layer(sheet)
|
240
243
|
tables = self._find_data_tables(sheet)
|
241
244
|
|
242
245
|
for excel_table in tables:
|
@@ -282,6 +285,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
282
285
|
origin=CoordOrigin.TOPLEFT,
|
283
286
|
),
|
284
287
|
),
|
288
|
+
content_layer=content_layer,
|
285
289
|
)
|
286
290
|
|
287
291
|
return doc
|
@@ -486,6 +490,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
486
490
|
The updated DoclingDocument.
|
487
491
|
"""
|
488
492
|
if self.workbook is not None:
|
493
|
+
content_layer = self._get_sheet_content_layer(sheet)
|
489
494
|
# Iterate over byte images in the sheet
|
490
495
|
for item in sheet._images: # type: ignore[attr-defined]
|
491
496
|
try:
|
@@ -511,6 +516,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
511
516
|
anchor, origin=CoordOrigin.TOPLEFT
|
512
517
|
),
|
513
518
|
),
|
519
|
+
content_layer=content_layer,
|
514
520
|
)
|
515
521
|
except Exception:
|
516
522
|
_log.error("could not extract the image from excel sheets")
|
@@ -536,3 +542,11 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
536
542
|
bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b
|
537
543
|
|
538
544
|
return (right - left, bottom - top)
|
545
|
+
|
546
|
+
@staticmethod
|
547
|
+
def _get_sheet_content_layer(sheet: Worksheet) -> Optional[ContentLayer]:
|
548
|
+
return (
|
549
|
+
None
|
550
|
+
if sheet.sheet_state == Worksheet.SHEETSTATE_VISIBLE
|
551
|
+
else ContentLayer.INVISIBLE
|
552
|
+
)
|
@@ -254,16 +254,38 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
254
254
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
255
255
|
AREA_THRESHOLD = 0 # 32 * 32
|
256
256
|
page_size = self.get_size()
|
257
|
+
rotation = self._ppage.get_rotation()
|
258
|
+
|
257
259
|
with pypdfium2_lock:
|
258
260
|
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
259
261
|
pos = obj.get_pos()
|
262
|
+
if rotation == 90:
|
263
|
+
pos = (
|
264
|
+
pos[1],
|
265
|
+
page_size.height - pos[2],
|
266
|
+
pos[3],
|
267
|
+
page_size.height - pos[0],
|
268
|
+
)
|
269
|
+
elif rotation == 180:
|
270
|
+
pos = (
|
271
|
+
page_size.width - pos[2],
|
272
|
+
page_size.height - pos[3],
|
273
|
+
page_size.width - pos[0],
|
274
|
+
page_size.height - pos[1],
|
275
|
+
)
|
276
|
+
elif rotation == 270:
|
277
|
+
pos = (
|
278
|
+
page_size.width - pos[3],
|
279
|
+
pos[0],
|
280
|
+
page_size.width - pos[1],
|
281
|
+
pos[2],
|
282
|
+
)
|
283
|
+
|
260
284
|
cropbox = BoundingBox.from_tuple(
|
261
285
|
pos, origin=CoordOrigin.BOTTOMLEFT
|
262
286
|
).to_top_left_origin(page_height=page_size.height)
|
263
|
-
|
264
287
|
if cropbox.area() > AREA_THRESHOLD:
|
265
288
|
cropbox = cropbox.scaled(scale=scale)
|
266
|
-
|
267
289
|
yield cropbox
|
268
290
|
|
269
291
|
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
@@ -1,7 +1,7 @@
|
|
1
1
|
import math
|
2
2
|
from collections import defaultdict
|
3
3
|
from enum import Enum
|
4
|
-
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
4
|
+
from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union
|
5
5
|
|
6
6
|
import numpy as np
|
7
7
|
from docling_core.types.doc import (
|
@@ -32,6 +32,18 @@ from pydantic import (
|
|
32
32
|
if TYPE_CHECKING:
|
33
33
|
from docling.backend.pdf_backend import PdfPageBackend
|
34
34
|
|
35
|
+
from docling.backend.abstract_backend import AbstractDocumentBackend
|
36
|
+
from docling.datamodel.pipeline_options import PipelineOptions
|
37
|
+
|
38
|
+
|
39
|
+
class BaseFormatOption(BaseModel):
|
40
|
+
"""Base class for format options used by _DocumentConversionInput."""
|
41
|
+
|
42
|
+
pipeline_options: Optional[PipelineOptions] = None
|
43
|
+
backend: Type[AbstractDocumentBackend]
|
44
|
+
|
45
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
46
|
+
|
35
47
|
|
36
48
|
class ConversionStatus(str, Enum):
|
37
49
|
PENDING = "pending"
|