docling 2.38.1__tar.gz → 2.40.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.38.1 → docling-2.40.0}/PKG-INFO +4 -4
- {docling-2.38.1 → docling-2.40.0}/docling/backend/docling_parse_v4_backend.py +14 -4
- {docling-2.38.1 → docling-2.40.0}/docling/backend/html_backend.py +31 -42
- {docling-2.38.1 → docling-2.40.0}/docling/backend/md_backend.py +25 -12
- {docling-2.38.1 → docling-2.40.0}/docling/backend/msexcel_backend.py +33 -14
- {docling-2.38.1 → docling-2.40.0}/docling/backend/mspowerpoint_backend.py +4 -5
- {docling-2.38.1 → docling-2.40.0}/docling/backend/msword_backend.py +31 -36
- {docling-2.38.1 → docling-2.40.0}/docling/datamodel/pipeline_options.py +8 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/base_ocr_model.py +6 -2
- {docling-2.38.1 → docling-2.40.0}/docling/models/layout_model.py +10 -3
- {docling-2.38.1 → docling-2.40.0}/docling/models/picture_description_vlm_model.py +16 -11
- docling-2.40.0/docling/models/plugins/defaults.py +28 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/readingorder_model.py +8 -1
- {docling-2.38.1 → docling-2.40.0}/docling/models/table_structure_model.py +3 -1
- {docling-2.38.1 → docling-2.40.0}/docling/models/tesseract_ocr_model.py +10 -4
- {docling-2.38.1 → docling-2.40.0}/docling/pipeline/standard_pdf_pipeline.py +1 -0
- {docling-2.38.1 → docling-2.40.0}/docling/utils/accelerator_utils.py +2 -2
- {docling-2.38.1 → docling-2.40.0}/docling/utils/layout_postprocessor.py +7 -2
- {docling-2.38.1 → docling-2.40.0}/docling.egg-info/PKG-INFO +4 -4
- {docling-2.38.1 → docling-2.40.0}/docling.egg-info/requires.txt +3 -3
- {docling-2.38.1 → docling-2.40.0}/pyproject.toml +4 -4
- {docling-2.38.1 → docling-2.40.0}/tests/test_backend_docling_parse_v4.py +17 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_backend_pptx.py +2 -2
- docling-2.38.1/docling/models/plugins/defaults.py +0 -28
- {docling-2.38.1 → docling-2.40.0}/LICENSE +0 -0
- {docling-2.38.1 → docling-2.40.0}/README.md +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/__init__.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/backend/__init__.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/backend/noop_backend.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/chunking/__init__.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/cli/__init__.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/cli/main.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/cli/models.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/cli/tools.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/datamodel/asr_model_specs.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/datamodel/base_models.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/datamodel/document.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/datamodel/settings.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/datamodel/vlm_model_specs.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/document_converter.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/exceptions.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/__init__.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/api_vlm_model.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/base_model.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/utils/__init__.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/utils/hf_model_download.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/pipeline/asr_pipeline.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/pipeline/vlm_pipeline.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/py.typed +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/utils/__init__.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/utils/export.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/utils/locks.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/utils/model_downloader.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/utils/orientation.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/utils/profiling.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/utils/utils.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling/utils/visualization.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling.egg-info/SOURCES.txt +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.38.1 → docling-2.40.0}/docling.egg-info/top_level.txt +0 -0
- {docling-2.38.1 → docling-2.40.0}/setup.cfg +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_asr_pipeline.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_backend_csv.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_backend_docling_json.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_backend_html.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_backend_jats.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_backend_markdown.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_backend_msexcel.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_backend_msword.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_backend_patent_uspto.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_backend_pdfium.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_backend_webp.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_cli.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_code_formula.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_data_gen_flag.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_document_picture_classifier.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_e2e_conversion.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_e2e_ocr_conversion.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_input_doc.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_interfaces.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_invalid_input.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_legacy_format_transform.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_options.py +0 -0
- {docling-2.38.1 → docling-2.40.0}/tests/test_settings_load.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.
|
3
|
+
Version: 2.40.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -26,9 +26,9 @@ Requires-Python: <4.0,>=3.9
|
|
26
26
|
Description-Content-Type: text/markdown
|
27
27
|
License-File: LICENSE
|
28
28
|
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
29
|
-
Requires-Dist: docling-core[chunking]<3.0.0,>=2.
|
30
|
-
Requires-Dist: docling-ibm-models<4.0.0,>=3.4.4
|
29
|
+
Requires-Dist: docling-core[chunking]<3.0.0,>=2.39.0
|
31
30
|
Requires-Dist: docling-parse<5.0.0,>=4.0.0
|
31
|
+
Requires-Dist: docling-ibm-models<4,>=3.6.0
|
32
32
|
Requires-Dist: filetype<2.0.0,>=1.2.0
|
33
33
|
Requires-Dist: pypdfium2<5.0.0,>=4.30.0
|
34
34
|
Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
|
@@ -57,7 +57,7 @@ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin" and extra == "ocrm
|
|
57
57
|
Provides-Extra: vlm
|
58
58
|
Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
|
59
59
|
Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
|
60
|
-
Requires-Dist: mlx-vlm
|
60
|
+
Requires-Dist: mlx-vlm<0.2,>=0.1.22; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
|
61
61
|
Provides-Extra: rapidocr
|
62
62
|
Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
|
63
63
|
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
|
@@ -187,7 +187,17 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
|
|
187
187
|
|
188
188
|
def unload(self):
|
189
189
|
super().unload()
|
190
|
-
|
191
|
-
|
192
|
-
self.
|
193
|
-
|
190
|
+
# Unload docling-parse document first
|
191
|
+
if self.dp_doc is not None:
|
192
|
+
self.dp_doc.unload()
|
193
|
+
self.dp_doc = None
|
194
|
+
|
195
|
+
# Then close pypdfium2 document with proper locking
|
196
|
+
if self._pdoc is not None:
|
197
|
+
with pypdfium2_lock:
|
198
|
+
try:
|
199
|
+
self._pdoc.close()
|
200
|
+
except Exception:
|
201
|
+
# Ignore cleanup errors
|
202
|
+
pass
|
203
|
+
self._pdoc = None
|
@@ -17,6 +17,7 @@ from docling_core.types.doc import (
|
|
17
17
|
TableData,
|
18
18
|
)
|
19
19
|
from docling_core.types.doc.document import ContentLayer
|
20
|
+
from pydantic import BaseModel
|
20
21
|
from typing_extensions import override
|
21
22
|
|
22
23
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
@@ -48,6 +49,11 @@ TAGS_FOR_NODE_ITEMS: Final = [
|
|
48
49
|
]
|
49
50
|
|
50
51
|
|
52
|
+
class _Context(BaseModel):
|
53
|
+
list_ordered_flag_by_ref: dict[str, bool] = {}
|
54
|
+
list_start_by_ref: dict[str, int] = {}
|
55
|
+
|
56
|
+
|
51
57
|
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
52
58
|
@override
|
53
59
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
@@ -59,6 +65,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
59
65
|
self.max_levels = 10
|
60
66
|
self.level = 0
|
61
67
|
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
|
68
|
+
self.ctx = _Context()
|
62
69
|
for i in range(self.max_levels):
|
63
70
|
self.parents[i] = None
|
64
71
|
|
@@ -121,6 +128,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
121
128
|
self.content_layer = (
|
122
129
|
ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
|
123
130
|
)
|
131
|
+
self.ctx = _Context() # reset context
|
124
132
|
self.walk(content, doc)
|
125
133
|
else:
|
126
134
|
raise RuntimeError(
|
@@ -294,28 +302,25 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
294
302
|
def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
|
295
303
|
"""Handles list tags (ul, ol) and their list items."""
|
296
304
|
|
297
|
-
|
298
|
-
|
299
|
-
self.parents[self.level + 1] = doc.add_group(
|
300
|
-
parent=self.parents[self.level],
|
301
|
-
name="list",
|
302
|
-
label=GroupLabel.LIST,
|
303
|
-
content_layer=self.content_layer,
|
304
|
-
)
|
305
|
-
elif element.name == "ol":
|
305
|
+
start: Optional[int] = None
|
306
|
+
if is_ordered := element.name == "ol":
|
306
307
|
start_attr = element.get("start")
|
307
|
-
|
308
|
-
int(start_attr)
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
308
|
+
if isinstance(start_attr, str) and start_attr.isnumeric():
|
309
|
+
start = int(start_attr)
|
310
|
+
name = "ordered list" + (f" start {start}" if start is not None else "")
|
311
|
+
else:
|
312
|
+
name = "list"
|
313
|
+
# create a list group
|
314
|
+
list_group = doc.add_list_group(
|
315
|
+
name=name,
|
316
|
+
parent=self.parents[self.level],
|
317
|
+
content_layer=self.content_layer,
|
318
|
+
)
|
319
|
+
self.parents[self.level + 1] = list_group
|
320
|
+
self.ctx.list_ordered_flag_by_ref[list_group.self_ref] = is_ordered
|
321
|
+
if is_ordered and start is not None:
|
322
|
+
self.ctx.list_start_by_ref[list_group.self_ref] = start
|
323
|
+
|
319
324
|
self.level += 1
|
320
325
|
|
321
326
|
self.walk(element, doc)
|
@@ -331,16 +336,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
331
336
|
if parent is None:
|
332
337
|
_log.debug(f"list-item has no parent in DoclingDocument: {element}")
|
333
338
|
return
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
and parent.name
|
340
|
-
):
|
341
|
-
start_in_list: str = parent.name.split(" ")[-1]
|
342
|
-
start: int = int(start_in_list) if start_in_list.isnumeric() else 1
|
343
|
-
index_in_list += start - 1
|
339
|
+
enumerated = self.ctx.list_ordered_flag_by_ref.get(parent.self_ref, False)
|
340
|
+
if enumerated and (start := self.ctx.list_start_by_ref.get(parent.self_ref)):
|
341
|
+
marker = f"{start + len(parent.children)}."
|
342
|
+
else:
|
343
|
+
marker = ""
|
344
344
|
|
345
345
|
if nested_list:
|
346
346
|
# Text in list item can be hidden within hierarchy, hence
|
@@ -350,12 +350,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
350
350
|
text = text.replace("\n", "").replace("\r", "")
|
351
351
|
text = " ".join(text.split()).strip()
|
352
352
|
|
353
|
-
marker = ""
|
354
|
-
enumerated = False
|
355
|
-
if parent_label == GroupLabel.ORDERED_LIST:
|
356
|
-
marker = str(index_in_list)
|
357
|
-
enumerated = True
|
358
|
-
|
359
353
|
if len(text) > 0:
|
360
354
|
# create a list-item
|
361
355
|
self.parents[self.level + 1] = doc.add_list_item(
|
@@ -375,11 +369,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
375
369
|
elif element.text.strip():
|
376
370
|
text = element.text.strip()
|
377
371
|
|
378
|
-
marker = ""
|
379
|
-
enumerated = False
|
380
|
-
if parent_label == GroupLabel.ORDERED_LIST:
|
381
|
-
marker = f"{index_in_list!s}."
|
382
|
-
enumerated = True
|
383
372
|
doc.add_list_item(
|
384
373
|
text=text,
|
385
374
|
enumerated=enumerated,
|
@@ -14,13 +14,12 @@ from docling_core.types.doc import (
|
|
14
14
|
DocItemLabel,
|
15
15
|
DoclingDocument,
|
16
16
|
DocumentOrigin,
|
17
|
-
GroupLabel,
|
18
17
|
NodeItem,
|
19
18
|
TableCell,
|
20
19
|
TableData,
|
21
20
|
TextItem,
|
22
21
|
)
|
23
|
-
from docling_core.types.doc.document import Formatting
|
22
|
+
from docling_core.types.doc.document import Formatting
|
24
23
|
from marko import Markdown
|
25
24
|
from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
|
26
25
|
from typing_extensions import Annotated
|
@@ -51,6 +50,7 @@ class _HeadingCreationPayload(BaseModel):
|
|
51
50
|
|
52
51
|
class _ListItemCreationPayload(BaseModel):
|
53
52
|
kind: Literal["list_item"] = "list_item"
|
53
|
+
enumerated: bool
|
54
54
|
|
55
55
|
|
56
56
|
_CreationPayload = Annotated[
|
@@ -187,15 +187,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
187
187
|
doc: DoclingDocument,
|
188
188
|
parent_item: Optional[NodeItem],
|
189
189
|
text: str,
|
190
|
+
enumerated: bool,
|
190
191
|
formatting: Optional[Formatting] = None,
|
191
192
|
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
192
193
|
):
|
193
|
-
if not isinstance(parent_item, (OrderedList, UnorderedList)):
|
194
|
-
_log.warning("ListItem would have not had a list parent, adding one.")
|
195
|
-
parent_item = doc.add_unordered_list(parent=parent_item)
|
196
194
|
item = doc.add_list_item(
|
197
195
|
text=text,
|
198
|
-
enumerated=
|
196
|
+
enumerated=enumerated,
|
199
197
|
parent=parent_item,
|
200
198
|
formatting=formatting,
|
201
199
|
hyperlink=hyperlink,
|
@@ -238,6 +236,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
238
236
|
creation_stack: list[
|
239
237
|
_CreationPayload
|
240
238
|
], # stack for lazy item creation triggered deep in marko's AST (on RawText)
|
239
|
+
list_ordered_flag_by_ref: dict[str, bool],
|
241
240
|
parent_item: Optional[NodeItem] = None,
|
242
241
|
formatting: Optional[Formatting] = None,
|
243
242
|
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
@@ -275,10 +274,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
275
274
|
self._close_table(doc)
|
276
275
|
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
277
276
|
if has_non_empty_list_items:
|
278
|
-
|
279
|
-
parent_item =
|
280
|
-
label=label, name="list", parent=parent_item
|
281
|
-
)
|
277
|
+
parent_item = doc.add_list_group(name="list", parent=parent_item)
|
278
|
+
list_ordered_flag_by_ref[parent_item.self_ref] = element.ordered
|
282
279
|
|
283
280
|
elif (
|
284
281
|
isinstance(element, marko.block.ListItem)
|
@@ -289,16 +286,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
289
286
|
self._close_table(doc)
|
290
287
|
_log.debug(" - List item")
|
291
288
|
|
289
|
+
enumerated = (
|
290
|
+
list_ordered_flag_by_ref.get(parent_item.self_ref, False)
|
291
|
+
if parent_item
|
292
|
+
else False
|
293
|
+
)
|
292
294
|
if len(child.children) > 1: # inline group will be created further down
|
293
295
|
parent_item = self._create_list_item(
|
294
296
|
doc=doc,
|
295
297
|
parent_item=parent_item,
|
296
298
|
text="",
|
299
|
+
enumerated=enumerated,
|
297
300
|
formatting=formatting,
|
298
301
|
hyperlink=hyperlink,
|
299
302
|
)
|
300
303
|
else:
|
301
|
-
creation_stack.append(_ListItemCreationPayload())
|
304
|
+
creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))
|
302
305
|
|
303
306
|
elif isinstance(element, marko.inline.Image):
|
304
307
|
self._close_table(doc)
|
@@ -335,7 +338,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
335
338
|
_log.debug(f" - Paragraph (raw text): {element.children}")
|
336
339
|
snippet_text = element.children.strip()
|
337
340
|
# Detect start of the table:
|
338
|
-
if "|" in snippet_text:
|
341
|
+
if "|" in snippet_text or self.in_table:
|
339
342
|
# most likely part of the markdown table
|
340
343
|
self.in_table = True
|
341
344
|
if len(self.md_table_buffer) > 0:
|
@@ -349,10 +352,18 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
349
352
|
while len(creation_stack) > 0:
|
350
353
|
to_create = creation_stack.pop()
|
351
354
|
if isinstance(to_create, _ListItemCreationPayload):
|
355
|
+
enumerated = (
|
356
|
+
list_ordered_flag_by_ref.get(
|
357
|
+
parent_item.self_ref, False
|
358
|
+
)
|
359
|
+
if parent_item
|
360
|
+
else False
|
361
|
+
)
|
352
362
|
parent_item = self._create_list_item(
|
353
363
|
doc=doc,
|
354
364
|
parent_item=parent_item,
|
355
365
|
text=snippet_text,
|
366
|
+
enumerated=enumerated,
|
356
367
|
formatting=formatting,
|
357
368
|
hyperlink=hyperlink,
|
358
369
|
)
|
@@ -453,6 +464,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
453
464
|
doc=doc,
|
454
465
|
visited=visited,
|
455
466
|
creation_stack=creation_stack,
|
467
|
+
list_ordered_flag_by_ref=list_ordered_flag_by_ref,
|
456
468
|
parent_item=parent_item,
|
457
469
|
formatting=formatting,
|
458
470
|
hyperlink=hyperlink,
|
@@ -497,6 +509,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
497
509
|
parent_item=None,
|
498
510
|
visited=set(),
|
499
511
|
creation_stack=[],
|
512
|
+
list_ordered_flag_by_ref={},
|
500
513
|
)
|
501
514
|
self._close_table(doc=doc) # handle any last hanging table
|
502
515
|
|
@@ -337,10 +337,17 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
337
337
|
# Collect the data within the bounds
|
338
338
|
data = []
|
339
339
|
visited_cells: set[tuple[int, int]] = set()
|
340
|
-
for ri in
|
341
|
-
|
342
|
-
|
343
|
-
|
340
|
+
for ri, row in enumerate(
|
341
|
+
sheet.iter_rows(
|
342
|
+
min_row=start_row + 1, # start_row is 0-based but iter_rows is 1-based
|
343
|
+
max_row=max_row + 1,
|
344
|
+
min_col=start_col + 1,
|
345
|
+
max_col=max_col + 1,
|
346
|
+
values_only=False,
|
347
|
+
),
|
348
|
+
start_row,
|
349
|
+
):
|
350
|
+
for rj, cell in enumerate(row, start_col):
|
344
351
|
# Check if the cell belongs to a merged range
|
345
352
|
row_span = 1
|
346
353
|
col_span = 1
|
@@ -397,10 +404,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
397
404
|
"""
|
398
405
|
max_row: int = start_row
|
399
406
|
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
407
|
+
for ri, (cell,) in enumerate(
|
408
|
+
sheet.iter_rows(
|
409
|
+
min_row=start_row + 2,
|
410
|
+
max_row=sheet.max_row,
|
411
|
+
min_col=start_col + 1,
|
412
|
+
max_col=start_col + 1,
|
413
|
+
values_only=False,
|
414
|
+
),
|
415
|
+
start_row + 1,
|
416
|
+
):
|
404
417
|
# Check if the cell is part of a merged range
|
405
418
|
merged_range = next(
|
406
419
|
(mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
|
@@ -414,7 +427,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
414
427
|
if merged_range:
|
415
428
|
max_row = max(max_row, merged_range.max_row - 1)
|
416
429
|
else:
|
417
|
-
max_row
|
430
|
+
max_row = ri
|
418
431
|
|
419
432
|
return max_row
|
420
433
|
|
@@ -433,10 +446,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
433
446
|
"""
|
434
447
|
max_col: int = start_col
|
435
448
|
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
449
|
+
for rj, (cell,) in enumerate(
|
450
|
+
sheet.iter_cols(
|
451
|
+
min_row=start_row + 1,
|
452
|
+
max_row=start_row + 1,
|
453
|
+
min_col=start_col + 2,
|
454
|
+
max_col=sheet.max_column,
|
455
|
+
values_only=False,
|
456
|
+
),
|
457
|
+
start_col + 1,
|
458
|
+
):
|
440
459
|
# Check if the cell is part of a merged range
|
441
460
|
merged_range = next(
|
442
461
|
(mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
|
@@ -450,7 +469,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
450
469
|
if merged_range:
|
451
470
|
max_col = max(max_col, merged_range.max_col - 1)
|
452
471
|
else:
|
453
|
-
max_col
|
472
|
+
max_col = rj
|
454
473
|
|
455
474
|
return max_col
|
456
475
|
|
@@ -121,7 +121,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
121
121
|
|
122
122
|
return prov
|
123
123
|
|
124
|
-
def handle_text_elements(
|
124
|
+
def handle_text_elements(
|
125
|
+
self, shape, parent_slide, slide_ind, doc: DoclingDocument, slide_size
|
126
|
+
):
|
125
127
|
is_list_group_created = False
|
126
128
|
enum_list_item_value = 0
|
127
129
|
new_list = None
|
@@ -165,10 +167,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
165
167
|
enumerated = bullet_type == "Numbered"
|
166
168
|
|
167
169
|
if not is_list_group_created:
|
168
|
-
new_list = doc.
|
169
|
-
label=GroupLabel.ORDERED_LIST
|
170
|
-
if enumerated
|
171
|
-
else GroupLabel.LIST,
|
170
|
+
new_list = doc.add_list_group(
|
172
171
|
name="list",
|
173
172
|
parent=parent_slide,
|
174
173
|
)
|
@@ -10,11 +10,12 @@ from docling_core.types.doc import (
|
|
10
10
|
DocumentOrigin,
|
11
11
|
GroupLabel,
|
12
12
|
ImageRef,
|
13
|
+
ListGroup,
|
13
14
|
NodeItem,
|
14
15
|
TableCell,
|
15
16
|
TableData,
|
16
17
|
)
|
17
|
-
from docling_core.types.doc.document import Formatting
|
18
|
+
from docling_core.types.doc.document import Formatting
|
18
19
|
from docx import Document
|
19
20
|
from docx.document import Document as DocxDocument
|
20
21
|
from docx.oxml.table import CT_Tc
|
@@ -688,7 +689,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
688
689
|
paragraph_elements: list,
|
689
690
|
) -> Optional[NodeItem]:
|
690
691
|
return (
|
691
|
-
doc.
|
692
|
+
doc.add_inline_group(parent=prev_parent)
|
692
693
|
if len(paragraph_elements) > 1
|
693
694
|
else prev_parent
|
694
695
|
)
|
@@ -781,9 +782,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
781
782
|
else:
|
782
783
|
# Inline equation
|
783
784
|
level = self._get_level()
|
784
|
-
inline_equation = doc.
|
785
|
-
label=GroupLabel.INLINE, parent=self.parents[level - 1]
|
786
|
-
)
|
785
|
+
inline_equation = doc.add_inline_group(parent=self.parents[level - 1])
|
787
786
|
text_tmp = text
|
788
787
|
for eq in equations:
|
789
788
|
if len(text_tmp) == 0:
|
@@ -931,18 +930,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
931
930
|
level: int,
|
932
931
|
) -> None:
|
933
932
|
# This should not happen by construction
|
934
|
-
if not isinstance(self.parents[level],
|
933
|
+
if not isinstance(self.parents[level], ListGroup):
|
934
|
+
return
|
935
|
+
if not elements:
|
935
936
|
return
|
937
|
+
|
936
938
|
if len(elements) == 1:
|
937
939
|
text, format, hyperlink = elements[0]
|
938
|
-
|
939
|
-
|
940
|
-
|
941
|
-
|
942
|
-
|
943
|
-
|
944
|
-
|
945
|
-
|
940
|
+
if text:
|
941
|
+
doc.add_list_item(
|
942
|
+
marker=marker,
|
943
|
+
enumerated=enumerated,
|
944
|
+
parent=self.parents[level],
|
945
|
+
text=text,
|
946
|
+
formatting=format,
|
947
|
+
hyperlink=hyperlink,
|
948
|
+
)
|
946
949
|
else:
|
947
950
|
new_item = doc.add_list_item(
|
948
951
|
marker=marker,
|
@@ -950,15 +953,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
950
953
|
parent=self.parents[level],
|
951
954
|
text="",
|
952
955
|
)
|
953
|
-
new_parent = doc.
|
956
|
+
new_parent = doc.add_inline_group(parent=new_item)
|
954
957
|
for text, format, hyperlink in elements:
|
955
|
-
|
956
|
-
|
957
|
-
|
958
|
-
|
959
|
-
|
960
|
-
|
961
|
-
|
958
|
+
if text:
|
959
|
+
doc.add_text(
|
960
|
+
label=DocItemLabel.TEXT,
|
961
|
+
parent=new_parent,
|
962
|
+
text=text,
|
963
|
+
formatting=format,
|
964
|
+
hyperlink=hyperlink,
|
965
|
+
)
|
962
966
|
|
963
967
|
def _add_list_item(
|
964
968
|
self,
|
@@ -979,8 +983,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
979
983
|
if self._prev_numid() is None: # Open new list
|
980
984
|
self.level_at_new_list = level
|
981
985
|
|
982
|
-
self.parents[level] = doc.
|
983
|
-
|
986
|
+
self.parents[level] = doc.add_list_group(
|
987
|
+
name="list", parent=self.parents[level - 1]
|
984
988
|
)
|
985
989
|
|
986
990
|
# Set marker and enumerated arguments if this is an enumeration element.
|
@@ -1001,19 +1005,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
1001
1005
|
self.level_at_new_list + prev_indent + 1,
|
1002
1006
|
self.level_at_new_list + ilevel + 1,
|
1003
1007
|
):
|
1004
|
-
# Determine if this is an unordered list or an ordered list.
|
1005
|
-
# Set GroupLabel.ORDERED_LIST when it fits.
|
1006
1008
|
self.listIter = 0
|
1007
|
-
|
1008
|
-
self.parents[i
|
1009
|
-
|
1010
|
-
name="list",
|
1011
|
-
parent=self.parents[i - 1],
|
1012
|
-
)
|
1013
|
-
else:
|
1014
|
-
self.parents[i] = doc.add_group(
|
1015
|
-
label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
|
1016
|
-
)
|
1009
|
+
self.parents[i] = doc.add_list_group(
|
1010
|
+
name="list", parent=self.parents[i - 1]
|
1011
|
+
)
|
1017
1012
|
|
1018
1013
|
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
1019
1014
|
self.listIter += 1
|
@@ -1,4 +1,5 @@
|
|
1
1
|
import logging
|
2
|
+
from datetime import datetime
|
2
3
|
from enum import Enum
|
3
4
|
from pathlib import Path
|
4
5
|
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
|
@@ -265,6 +266,12 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
|
|
265
266
|
)
|
266
267
|
|
267
268
|
|
269
|
+
class LayoutOptions(BaseModel):
|
270
|
+
"""Options for layout processing."""
|
271
|
+
|
272
|
+
create_orphan_clusters: bool = True # Whether to create clusters for orphaned cells
|
273
|
+
|
274
|
+
|
268
275
|
class AsrPipelineOptions(PipelineOptions):
|
269
276
|
asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
|
270
277
|
artifacts_path: Optional[Union[Path, str]] = None
|
@@ -289,6 +296,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|
289
296
|
picture_description_options: PictureDescriptionBaseOptions = (
|
290
297
|
smolvlm_picture_description
|
291
298
|
)
|
299
|
+
layout_options: LayoutOptions = LayoutOptions()
|
292
300
|
|
293
301
|
images_scale: float = 1.0
|
294
302
|
generate_page_images: bool = False
|
@@ -3,14 +3,13 @@ import logging
|
|
3
3
|
from abc import abstractmethod
|
4
4
|
from collections.abc import Iterable
|
5
5
|
from pathlib import Path
|
6
|
-
from typing import List, Optional, Type
|
6
|
+
from typing import TYPE_CHECKING, List, Optional, Type
|
7
7
|
|
8
8
|
import numpy as np
|
9
9
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
10
10
|
from docling_core.types.doc.page import TextCell
|
11
11
|
from PIL import Image, ImageDraw
|
12
12
|
from rtree import index
|
13
|
-
from scipy.ndimage import binary_dilation, find_objects, label
|
14
13
|
|
15
14
|
from docling.datamodel.accelerator_options import AcceleratorOptions
|
16
15
|
from docling.datamodel.base_models import Page
|
@@ -31,11 +30,16 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
|
|
31
30
|
options: OcrOptions,
|
32
31
|
accelerator_options: AcceleratorOptions,
|
33
32
|
):
|
33
|
+
# Make sure any delay/error from import occurs on ocr model init and not first use
|
34
|
+
from scipy.ndimage import binary_dilation, find_objects, label
|
35
|
+
|
34
36
|
self.enabled = enabled
|
35
37
|
self.options = options
|
36
38
|
|
37
39
|
# Computes the optimum amount and coordinates of rectangles to OCR on a given page
|
38
40
|
def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
|
41
|
+
from scipy.ndimage import binary_dilation, find_objects, label
|
42
|
+
|
39
43
|
BITMAP_COVERAGE_TRESHOLD = 0.75
|
40
44
|
assert page.size is not None
|
41
45
|
|
@@ -7,12 +7,12 @@ from typing import Optional
|
|
7
7
|
|
8
8
|
import numpy as np
|
9
9
|
from docling_core.types.doc import DocItemLabel
|
10
|
-
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
11
10
|
from PIL import Image
|
12
11
|
|
13
12
|
from docling.datamodel.accelerator_options import AcceleratorOptions
|
14
13
|
from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
|
15
14
|
from docling.datamodel.document import ConversionResult
|
15
|
+
from docling.datamodel.pipeline_options import LayoutOptions
|
16
16
|
from docling.datamodel.settings import settings
|
17
17
|
from docling.models.base_model import BasePageModel
|
18
18
|
from docling.models.utils.hf_model_download import download_hf_model
|
@@ -49,8 +49,15 @@ class LayoutModel(BasePageModel):
|
|
49
49
|
CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]
|
50
50
|
|
51
51
|
def __init__(
|
52
|
-
self,
|
52
|
+
self,
|
53
|
+
artifacts_path: Optional[Path],
|
54
|
+
accelerator_options: AcceleratorOptions,
|
55
|
+
options: LayoutOptions,
|
53
56
|
):
|
57
|
+
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
58
|
+
|
59
|
+
self.options = options
|
60
|
+
|
54
61
|
device = decide_device(accelerator_options.device)
|
55
62
|
|
56
63
|
if artifacts_path is None:
|
@@ -176,7 +183,7 @@ class LayoutModel(BasePageModel):
|
|
176
183
|
# Apply postprocessing
|
177
184
|
|
178
185
|
processed_clusters, processed_cells = LayoutPostprocessor(
|
179
|
-
page, clusters
|
186
|
+
page, clusters, self.options
|
180
187
|
).postprocess()
|
181
188
|
# Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
|
182
189
|
|