docling 2.38.0__tar.gz → 2.39.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.38.0 → docling-2.39.0}/PKG-INFO +2 -2
- {docling-2.38.0 → docling-2.39.0}/docling/backend/html_backend.py +31 -42
- {docling-2.38.0 → docling-2.39.0}/docling/backend/md_backend.py +148 -50
- {docling-2.38.0 → docling-2.39.0}/docling/backend/mspowerpoint_backend.py +4 -5
- {docling-2.38.0 → docling-2.39.0}/docling/backend/msword_backend.py +36 -37
- {docling-2.38.0 → docling-2.39.0}/docling/datamodel/base_models.py +1 -1
- {docling-2.38.0 → docling-2.39.0}/docling/datamodel/pipeline_options.py +1 -1
- {docling-2.38.0 → docling-2.39.0}/docling.egg-info/PKG-INFO +2 -2
- {docling-2.38.0 → docling-2.39.0}/docling.egg-info/requires.txt +1 -1
- {docling-2.38.0 → docling-2.39.0}/pyproject.toml +2 -2
- {docling-2.38.0 → docling-2.39.0}/tests/test_backend_pptx.py +2 -2
- {docling-2.38.0 → docling-2.39.0}/LICENSE +0 -0
- {docling-2.38.0 → docling-2.39.0}/README.md +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/__init__.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/backend/__init__.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/backend/docling_parse_v4_backend.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/backend/noop_backend.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/chunking/__init__.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/cli/__init__.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/cli/main.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/cli/models.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/cli/tools.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/datamodel/asr_model_specs.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/datamodel/document.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/datamodel/settings.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/datamodel/vlm_model_specs.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/document_converter.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/exceptions.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/__init__.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/api_vlm_model.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/base_model.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/layout_model.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/readingorder_model.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/utils/__init__.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/utils/hf_model_download.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/pipeline/asr_pipeline.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/pipeline/vlm_pipeline.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/py.typed +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/utils/__init__.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/utils/export.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/utils/locks.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/utils/model_downloader.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/utils/orientation.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/utils/profiling.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/utils/utils.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling/utils/visualization.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling.egg-info/SOURCES.txt +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.38.0 → docling-2.39.0}/docling.egg-info/top_level.txt +0 -0
- {docling-2.38.0 → docling-2.39.0}/setup.cfg +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_asr_pipeline.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_backend_csv.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_backend_docling_json.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_backend_docling_parse_v4.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_backend_html.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_backend_jats.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_backend_markdown.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_backend_msexcel.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_backend_msword.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_backend_patent_uspto.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_backend_pdfium.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_backend_webp.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_cli.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_code_formula.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_data_gen_flag.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_document_picture_classifier.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_e2e_conversion.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_e2e_ocr_conversion.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_input_doc.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_interfaces.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_invalid_input.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_legacy_format_transform.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_options.py +0 -0
- {docling-2.38.0 → docling-2.39.0}/tests/test_settings_load.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.38.0
|
3
|
+
Version: 2.39.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -26,7 +26,7 @@ Requires-Python: <4.0,>=3.9
|
|
26
26
|
Description-Content-Type: text/markdown
|
27
27
|
License-File: LICENSE
|
28
28
|
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
29
|
-
Requires-Dist: docling-core[chunking]<3.0.0,>=2.
|
29
|
+
Requires-Dist: docling-core[chunking]<3.0.0,>=2.39.0
|
30
30
|
Requires-Dist: docling-ibm-models<4.0.0,>=3.4.4
|
31
31
|
Requires-Dist: docling-parse<5.0.0,>=4.0.0
|
32
32
|
Requires-Dist: filetype<2.0.0,>=1.2.0
|
@@ -17,6 +17,7 @@ from docling_core.types.doc import (
|
|
17
17
|
TableData,
|
18
18
|
)
|
19
19
|
from docling_core.types.doc.document import ContentLayer
|
20
|
+
from pydantic import BaseModel
|
20
21
|
from typing_extensions import override
|
21
22
|
|
22
23
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
@@ -48,6 +49,11 @@ TAGS_FOR_NODE_ITEMS: Final = [
|
|
48
49
|
]
|
49
50
|
|
50
51
|
|
52
|
+
class _Context(BaseModel):
|
53
|
+
list_ordered_flag_by_ref: dict[str, bool] = {}
|
54
|
+
list_start_by_ref: dict[str, int] = {}
|
55
|
+
|
56
|
+
|
51
57
|
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
52
58
|
@override
|
53
59
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
@@ -59,6 +65,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
59
65
|
self.max_levels = 10
|
60
66
|
self.level = 0
|
61
67
|
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
|
68
|
+
self.ctx = _Context()
|
62
69
|
for i in range(self.max_levels):
|
63
70
|
self.parents[i] = None
|
64
71
|
|
@@ -121,6 +128,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
121
128
|
self.content_layer = (
|
122
129
|
ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
|
123
130
|
)
|
131
|
+
self.ctx = _Context() # reset context
|
124
132
|
self.walk(content, doc)
|
125
133
|
else:
|
126
134
|
raise RuntimeError(
|
@@ -294,28 +302,25 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
294
302
|
def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
|
295
303
|
"""Handles list tags (ul, ol) and their list items."""
|
296
304
|
|
297
|
-
|
298
|
-
|
299
|
-
self.parents[self.level + 1] = doc.add_group(
|
300
|
-
parent=self.parents[self.level],
|
301
|
-
name="list",
|
302
|
-
label=GroupLabel.LIST,
|
303
|
-
content_layer=self.content_layer,
|
304
|
-
)
|
305
|
-
elif element.name == "ol":
|
305
|
+
start: Optional[int] = None
|
306
|
+
if is_ordered := element.name == "ol":
|
306
307
|
start_attr = element.get("start")
|
307
|
-
|
308
|
-
int(start_attr)
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
308
|
+
if isinstance(start_attr, str) and start_attr.isnumeric():
|
309
|
+
start = int(start_attr)
|
310
|
+
name = "ordered list" + (f" start {start}" if start is not None else "")
|
311
|
+
else:
|
312
|
+
name = "list"
|
313
|
+
# create a list group
|
314
|
+
list_group = doc.add_list_group(
|
315
|
+
name=name,
|
316
|
+
parent=self.parents[self.level],
|
317
|
+
content_layer=self.content_layer,
|
318
|
+
)
|
319
|
+
self.parents[self.level + 1] = list_group
|
320
|
+
self.ctx.list_ordered_flag_by_ref[list_group.self_ref] = is_ordered
|
321
|
+
if is_ordered and start is not None:
|
322
|
+
self.ctx.list_start_by_ref[list_group.self_ref] = start
|
323
|
+
|
319
324
|
self.level += 1
|
320
325
|
|
321
326
|
self.walk(element, doc)
|
@@ -331,16 +336,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
331
336
|
if parent is None:
|
332
337
|
_log.debug(f"list-item has no parent in DoclingDocument: {element}")
|
333
338
|
return
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
and parent.name
|
340
|
-
):
|
341
|
-
start_in_list: str = parent.name.split(" ")[-1]
|
342
|
-
start: int = int(start_in_list) if start_in_list.isnumeric() else 1
|
343
|
-
index_in_list += start - 1
|
339
|
+
enumerated = self.ctx.list_ordered_flag_by_ref.get(parent.self_ref, False)
|
340
|
+
if enumerated and (start := self.ctx.list_start_by_ref.get(parent.self_ref)):
|
341
|
+
marker = f"{start + len(parent.children)}."
|
342
|
+
else:
|
343
|
+
marker = ""
|
344
344
|
|
345
345
|
if nested_list:
|
346
346
|
# Text in list item can be hidden within hierarchy, hence
|
@@ -350,12 +350,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
350
350
|
text = text.replace("\n", "").replace("\r", "")
|
351
351
|
text = " ".join(text.split()).strip()
|
352
352
|
|
353
|
-
marker = ""
|
354
|
-
enumerated = False
|
355
|
-
if parent_label == GroupLabel.ORDERED_LIST:
|
356
|
-
marker = str(index_in_list)
|
357
|
-
enumerated = True
|
358
|
-
|
359
353
|
if len(text) > 0:
|
360
354
|
# create a list-item
|
361
355
|
self.parents[self.level + 1] = doc.add_list_item(
|
@@ -375,11 +369,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
375
369
|
elif element.text.strip():
|
376
370
|
text = element.text.strip()
|
377
371
|
|
378
|
-
marker = ""
|
379
|
-
enumerated = False
|
380
|
-
if parent_label == GroupLabel.ORDERED_LIST:
|
381
|
-
marker = f"{index_in_list!s}."
|
382
|
-
enumerated = True
|
383
372
|
doc.add_list_item(
|
384
373
|
text=text,
|
385
374
|
enumerated=enumerated,
|
@@ -2,9 +2,10 @@ import logging
|
|
2
2
|
import re
|
3
3
|
import warnings
|
4
4
|
from copy import deepcopy
|
5
|
+
from enum import Enum
|
5
6
|
from io import BytesIO
|
6
7
|
from pathlib import Path
|
7
|
-
from typing import List, Optional, Set, Union
|
8
|
+
from typing import List, Literal, Optional, Set, Union
|
8
9
|
|
9
10
|
import marko
|
10
11
|
import marko.element
|
@@ -13,15 +14,15 @@ from docling_core.types.doc import (
|
|
13
14
|
DocItemLabel,
|
14
15
|
DoclingDocument,
|
15
16
|
DocumentOrigin,
|
16
|
-
GroupLabel,
|
17
17
|
NodeItem,
|
18
18
|
TableCell,
|
19
19
|
TableData,
|
20
20
|
TextItem,
|
21
21
|
)
|
22
|
-
from docling_core.types.doc.document import Formatting
|
22
|
+
from docling_core.types.doc.document import Formatting
|
23
23
|
from marko import Markdown
|
24
|
-
from pydantic import AnyUrl, TypeAdapter
|
24
|
+
from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
|
25
|
+
from typing_extensions import Annotated
|
25
26
|
|
26
27
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
27
28
|
from docling.backend.html_backend import HTMLDocumentBackend
|
@@ -35,6 +36,32 @@ _START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
|
|
35
36
|
_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
|
36
37
|
|
37
38
|
|
39
|
+
class _PendingCreationType(str, Enum):
|
40
|
+
"""CoordOrigin."""
|
41
|
+
|
42
|
+
HEADING = "heading"
|
43
|
+
LIST_ITEM = "list_item"
|
44
|
+
|
45
|
+
|
46
|
+
class _HeadingCreationPayload(BaseModel):
|
47
|
+
kind: Literal["heading"] = "heading"
|
48
|
+
level: int
|
49
|
+
|
50
|
+
|
51
|
+
class _ListItemCreationPayload(BaseModel):
|
52
|
+
kind: Literal["list_item"] = "list_item"
|
53
|
+
enumerated: bool
|
54
|
+
|
55
|
+
|
56
|
+
_CreationPayload = Annotated[
|
57
|
+
Union[
|
58
|
+
_HeadingCreationPayload,
|
59
|
+
_ListItemCreationPayload,
|
60
|
+
],
|
61
|
+
Field(discriminator="kind"),
|
62
|
+
]
|
63
|
+
|
64
|
+
|
38
65
|
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
39
66
|
def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
40
67
|
# This regex will match any sequence of underscores
|
@@ -155,6 +182,50 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
155
182
|
doc.add_table(data=table_data)
|
156
183
|
return
|
157
184
|
|
185
|
+
def _create_list_item(
|
186
|
+
self,
|
187
|
+
doc: DoclingDocument,
|
188
|
+
parent_item: Optional[NodeItem],
|
189
|
+
text: str,
|
190
|
+
enumerated: bool,
|
191
|
+
formatting: Optional[Formatting] = None,
|
192
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
193
|
+
):
|
194
|
+
item = doc.add_list_item(
|
195
|
+
text=text,
|
196
|
+
enumerated=enumerated,
|
197
|
+
parent=parent_item,
|
198
|
+
formatting=formatting,
|
199
|
+
hyperlink=hyperlink,
|
200
|
+
)
|
201
|
+
return item
|
202
|
+
|
203
|
+
def _create_heading_item(
|
204
|
+
self,
|
205
|
+
doc: DoclingDocument,
|
206
|
+
parent_item: Optional[NodeItem],
|
207
|
+
text: str,
|
208
|
+
level: int,
|
209
|
+
formatting: Optional[Formatting] = None,
|
210
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
211
|
+
):
|
212
|
+
if level == 1:
|
213
|
+
item = doc.add_title(
|
214
|
+
text=text,
|
215
|
+
parent=parent_item,
|
216
|
+
formatting=formatting,
|
217
|
+
hyperlink=hyperlink,
|
218
|
+
)
|
219
|
+
else:
|
220
|
+
item = doc.add_heading(
|
221
|
+
text=text,
|
222
|
+
level=level - 1,
|
223
|
+
parent=parent_item,
|
224
|
+
formatting=formatting,
|
225
|
+
hyperlink=hyperlink,
|
226
|
+
)
|
227
|
+
return item
|
228
|
+
|
158
229
|
def _iterate_elements( # noqa: C901
|
159
230
|
self,
|
160
231
|
*,
|
@@ -162,6 +233,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
162
233
|
depth: int,
|
163
234
|
doc: DoclingDocument,
|
164
235
|
visited: Set[marko.element.Element],
|
236
|
+
creation_stack: list[
|
237
|
+
_CreationPayload
|
238
|
+
], # stack for lazy item creation triggered deep in marko's AST (on RawText)
|
239
|
+
list_ordered_flag_by_ref: dict[str, bool],
|
165
240
|
parent_item: Optional[NodeItem] = None,
|
166
241
|
formatting: Optional[Formatting] = None,
|
167
242
|
hyperlink: Optional[Union[AnyUrl, Path]] = None,
|
@@ -177,28 +252,17 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
177
252
|
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
|
178
253
|
)
|
179
254
|
|
180
|
-
if len(element.children)
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
if element.level == 1:
|
188
|
-
parent_item = doc.add_title(
|
189
|
-
text=snippet_text,
|
190
|
-
parent=parent_item,
|
255
|
+
if len(element.children) > 1: # inline group will be created further down
|
256
|
+
parent_item = self._create_heading_item(
|
257
|
+
doc=doc,
|
258
|
+
parent_item=parent_item,
|
259
|
+
text="",
|
260
|
+
level=element.level,
|
191
261
|
formatting=formatting,
|
192
262
|
hyperlink=hyperlink,
|
193
263
|
)
|
194
264
|
else:
|
195
|
-
|
196
|
-
text=snippet_text,
|
197
|
-
level=element.level - 1,
|
198
|
-
parent=parent_item,
|
199
|
-
formatting=formatting,
|
200
|
-
hyperlink=hyperlink,
|
201
|
-
)
|
265
|
+
creation_stack.append(_HeadingCreationPayload(level=element.level))
|
202
266
|
|
203
267
|
elif isinstance(element, marko.block.List):
|
204
268
|
has_non_empty_list_items = False
|
@@ -210,10 +274,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
210
274
|
self._close_table(doc)
|
211
275
|
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
212
276
|
if has_non_empty_list_items:
|
213
|
-
|
214
|
-
parent_item =
|
215
|
-
label=label, name="list", parent=parent_item
|
216
|
-
)
|
277
|
+
parent_item = doc.add_list_group(name="list", parent=parent_item)
|
278
|
+
list_ordered_flag_by_ref[parent_item.self_ref] = element.ordered
|
217
279
|
|
218
280
|
elif (
|
219
281
|
isinstance(element, marko.block.ListItem)
|
@@ -224,22 +286,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
224
286
|
self._close_table(doc)
|
225
287
|
_log.debug(" - List item")
|
226
288
|
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
snippet_text = "" # inline group will be created
|
232
|
-
is_numbered = isinstance(parent_item, OrderedList)
|
233
|
-
if not isinstance(parent_item, (OrderedList, UnorderedList)):
|
234
|
-
_log.warning("ListItem would have not had a list parent, adding one.")
|
235
|
-
parent_item = doc.add_unordered_list(parent=parent_item)
|
236
|
-
parent_item = doc.add_list_item(
|
237
|
-
enumerated=is_numbered,
|
238
|
-
parent=parent_item,
|
239
|
-
text=snippet_text,
|
240
|
-
formatting=formatting,
|
241
|
-
hyperlink=hyperlink,
|
289
|
+
enumerated = (
|
290
|
+
list_ordered_flag_by_ref.get(parent_item.self_ref, False)
|
291
|
+
if parent_item
|
292
|
+
else False
|
242
293
|
)
|
294
|
+
if len(child.children) > 1: # inline group will be created further down
|
295
|
+
parent_item = self._create_list_item(
|
296
|
+
doc=doc,
|
297
|
+
parent_item=parent_item,
|
298
|
+
text="",
|
299
|
+
enumerated=enumerated,
|
300
|
+
formatting=formatting,
|
301
|
+
hyperlink=hyperlink,
|
302
|
+
)
|
303
|
+
else:
|
304
|
+
creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))
|
243
305
|
|
244
306
|
elif isinstance(element, marko.inline.Image):
|
245
307
|
self._close_table(doc)
|
@@ -276,7 +338,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
276
338
|
_log.debug(f" - Paragraph (raw text): {element.children}")
|
277
339
|
snippet_text = element.children.strip()
|
278
340
|
# Detect start of the table:
|
279
|
-
if "|" in snippet_text:
|
341
|
+
if "|" in snippet_text or self.in_table:
|
280
342
|
# most likely part of the markdown table
|
281
343
|
self.in_table = True
|
282
344
|
if len(self.md_table_buffer) > 0:
|
@@ -285,13 +347,46 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
285
347
|
self.md_table_buffer.append(snippet_text)
|
286
348
|
elif snippet_text:
|
287
349
|
self._close_table(doc)
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
350
|
+
|
351
|
+
if creation_stack:
|
352
|
+
while len(creation_stack) > 0:
|
353
|
+
to_create = creation_stack.pop()
|
354
|
+
if isinstance(to_create, _ListItemCreationPayload):
|
355
|
+
enumerated = (
|
356
|
+
list_ordered_flag_by_ref.get(
|
357
|
+
parent_item.self_ref, False
|
358
|
+
)
|
359
|
+
if parent_item
|
360
|
+
else False
|
361
|
+
)
|
362
|
+
parent_item = self._create_list_item(
|
363
|
+
doc=doc,
|
364
|
+
parent_item=parent_item,
|
365
|
+
text=snippet_text,
|
366
|
+
enumerated=enumerated,
|
367
|
+
formatting=formatting,
|
368
|
+
hyperlink=hyperlink,
|
369
|
+
)
|
370
|
+
elif isinstance(to_create, _HeadingCreationPayload):
|
371
|
+
# not keeping as parent_item as logic for correctly tracking
|
372
|
+
# that not implemented yet (section components not captured
|
373
|
+
# as heading children in marko)
|
374
|
+
self._create_heading_item(
|
375
|
+
doc=doc,
|
376
|
+
parent_item=parent_item,
|
377
|
+
text=snippet_text,
|
378
|
+
level=to_create.level,
|
379
|
+
formatting=formatting,
|
380
|
+
hyperlink=hyperlink,
|
381
|
+
)
|
382
|
+
else:
|
383
|
+
doc.add_text(
|
384
|
+
label=DocItemLabel.TEXT,
|
385
|
+
parent=parent_item,
|
386
|
+
text=snippet_text,
|
387
|
+
formatting=formatting,
|
388
|
+
hyperlink=hyperlink,
|
389
|
+
)
|
295
390
|
|
296
391
|
elif isinstance(element, marko.inline.CodeSpan):
|
297
392
|
self._close_table(doc)
|
@@ -353,7 +448,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
353
448
|
parent_item = doc.add_inline_group(parent=parent_item)
|
354
449
|
|
355
450
|
processed_block_types = (
|
356
|
-
# marko.block.Heading,
|
357
451
|
marko.block.CodeBlock,
|
358
452
|
marko.block.FencedCode,
|
359
453
|
marko.inline.RawText,
|
@@ -369,6 +463,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
369
463
|
depth=depth + 1,
|
370
464
|
doc=doc,
|
371
465
|
visited=visited,
|
466
|
+
creation_stack=creation_stack,
|
467
|
+
list_ordered_flag_by_ref=list_ordered_flag_by_ref,
|
372
468
|
parent_item=parent_item,
|
373
469
|
formatting=formatting,
|
374
470
|
hyperlink=hyperlink,
|
@@ -412,6 +508,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
412
508
|
doc=doc,
|
413
509
|
parent_item=None,
|
414
510
|
visited=set(),
|
511
|
+
creation_stack=[],
|
512
|
+
list_ordered_flag_by_ref={},
|
415
513
|
)
|
416
514
|
self._close_table(doc=doc) # handle any last hanging table
|
417
515
|
|
@@ -121,7 +121,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
121
121
|
|
122
122
|
return prov
|
123
123
|
|
124
|
-
def handle_text_elements(
|
124
|
+
def handle_text_elements(
|
125
|
+
self, shape, parent_slide, slide_ind, doc: DoclingDocument, slide_size
|
126
|
+
):
|
125
127
|
is_list_group_created = False
|
126
128
|
enum_list_item_value = 0
|
127
129
|
new_list = None
|
@@ -165,10 +167,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
165
167
|
enumerated = bullet_type == "Numbered"
|
166
168
|
|
167
169
|
if not is_list_group_created:
|
168
|
-
new_list = doc.
|
169
|
-
label=GroupLabel.ORDERED_LIST
|
170
|
-
if enumerated
|
171
|
-
else GroupLabel.LIST,
|
170
|
+
new_list = doc.add_list_group(
|
172
171
|
name="list",
|
173
172
|
parent=parent_slide,
|
174
173
|
)
|
@@ -10,11 +10,12 @@ from docling_core.types.doc import (
|
|
10
10
|
DocumentOrigin,
|
11
11
|
GroupLabel,
|
12
12
|
ImageRef,
|
13
|
+
ListGroup,
|
13
14
|
NodeItem,
|
14
15
|
TableCell,
|
15
16
|
TableData,
|
16
17
|
)
|
17
|
-
from docling_core.types.doc.document import Formatting
|
18
|
+
from docling_core.types.doc.document import Formatting
|
18
19
|
from docx import Document
|
19
20
|
from docx.document import Document as DocxDocument
|
20
21
|
from docx.oxml.table import CT_Tc
|
@@ -397,7 +398,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
397
398
|
if isinstance(c, Hyperlink):
|
398
399
|
text = c.text
|
399
400
|
hyperlink = Path(c.address)
|
400
|
-
format =
|
401
|
+
format = (
|
402
|
+
self._get_format_from_run(c.runs[0])
|
403
|
+
if c.runs and len(c.runs) > 0
|
404
|
+
else None
|
405
|
+
)
|
401
406
|
elif isinstance(c, Run):
|
402
407
|
text = c.text
|
403
408
|
hyperlink = None
|
@@ -684,7 +689,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
684
689
|
paragraph_elements: list,
|
685
690
|
) -> Optional[NodeItem]:
|
686
691
|
return (
|
687
|
-
doc.
|
692
|
+
doc.add_inline_group(parent=prev_parent)
|
688
693
|
if len(paragraph_elements) > 1
|
689
694
|
else prev_parent
|
690
695
|
)
|
@@ -777,9 +782,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
777
782
|
else:
|
778
783
|
# Inline equation
|
779
784
|
level = self._get_level()
|
780
|
-
inline_equation = doc.
|
781
|
-
label=GroupLabel.INLINE, parent=self.parents[level - 1]
|
782
|
-
)
|
785
|
+
inline_equation = doc.add_inline_group(parent=self.parents[level - 1])
|
783
786
|
text_tmp = text
|
784
787
|
for eq in equations:
|
785
788
|
if len(text_tmp) == 0:
|
@@ -927,18 +930,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
927
930
|
level: int,
|
928
931
|
) -> None:
|
929
932
|
# This should not happen by construction
|
930
|
-
if not isinstance(self.parents[level],
|
933
|
+
if not isinstance(self.parents[level], ListGroup):
|
934
|
+
return
|
935
|
+
if not elements:
|
931
936
|
return
|
937
|
+
|
932
938
|
if len(elements) == 1:
|
933
939
|
text, format, hyperlink = elements[0]
|
934
|
-
|
935
|
-
|
936
|
-
|
937
|
-
|
938
|
-
|
939
|
-
|
940
|
-
|
941
|
-
|
940
|
+
if text:
|
941
|
+
doc.add_list_item(
|
942
|
+
marker=marker,
|
943
|
+
enumerated=enumerated,
|
944
|
+
parent=self.parents[level],
|
945
|
+
text=text,
|
946
|
+
formatting=format,
|
947
|
+
hyperlink=hyperlink,
|
948
|
+
)
|
942
949
|
else:
|
943
950
|
new_item = doc.add_list_item(
|
944
951
|
marker=marker,
|
@@ -946,15 +953,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
946
953
|
parent=self.parents[level],
|
947
954
|
text="",
|
948
955
|
)
|
949
|
-
new_parent = doc.
|
956
|
+
new_parent = doc.add_inline_group(parent=new_item)
|
950
957
|
for text, format, hyperlink in elements:
|
951
|
-
|
952
|
-
|
953
|
-
|
954
|
-
|
955
|
-
|
956
|
-
|
957
|
-
|
958
|
+
if text:
|
959
|
+
doc.add_text(
|
960
|
+
label=DocItemLabel.TEXT,
|
961
|
+
parent=new_parent,
|
962
|
+
text=text,
|
963
|
+
formatting=format,
|
964
|
+
hyperlink=hyperlink,
|
965
|
+
)
|
958
966
|
|
959
967
|
def _add_list_item(
|
960
968
|
self,
|
@@ -975,8 +983,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
975
983
|
if self._prev_numid() is None: # Open new list
|
976
984
|
self.level_at_new_list = level
|
977
985
|
|
978
|
-
self.parents[level] = doc.
|
979
|
-
|
986
|
+
self.parents[level] = doc.add_list_group(
|
987
|
+
name="list", parent=self.parents[level - 1]
|
980
988
|
)
|
981
989
|
|
982
990
|
# Set marker and enumerated arguments if this is an enumeration element.
|
@@ -997,19 +1005,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
997
1005
|
self.level_at_new_list + prev_indent + 1,
|
998
1006
|
self.level_at_new_list + ilevel + 1,
|
999
1007
|
):
|
1000
|
-
# Determine if this is an unordered list or an ordered list.
|
1001
|
-
# Set GroupLabel.ORDERED_LIST when it fits.
|
1002
1008
|
self.listIter = 0
|
1003
|
-
|
1004
|
-
self.parents[i
|
1005
|
-
|
1006
|
-
name="list",
|
1007
|
-
parent=self.parents[i - 1],
|
1008
|
-
)
|
1009
|
-
else:
|
1010
|
-
self.parents[i] = doc.add_group(
|
1011
|
-
label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
|
1012
|
-
)
|
1009
|
+
self.parents[i] = doc.add_list_group(
|
1010
|
+
name="list", parent=self.parents[i - 1]
|
1011
|
+
)
|
1013
1012
|
|
1014
1013
|
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
1015
1014
|
self.listIter += 1
|
@@ -207,7 +207,7 @@ smolvlm_picture_description = PictureDescriptionVlmOptions(
|
|
207
207
|
|
208
208
|
# GraniteVision
|
209
209
|
granite_picture_description = PictureDescriptionVlmOptions(
|
210
|
-
repo_id="ibm-granite/granite-vision-3.
|
210
|
+
repo_id="ibm-granite/granite-vision-3.2-2b-preview",
|
211
211
|
prompt="What is shown in this image?",
|
212
212
|
)
|
213
213
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.
|
3
|
+
Version: 2.39.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -26,7 +26,7 @@ Requires-Python: <4.0,>=3.9
|
|
26
26
|
Description-Content-Type: text/markdown
|
27
27
|
License-File: LICENSE
|
28
28
|
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
29
|
-
Requires-Dist: docling-core[chunking]<3.0.0,>=2.
|
29
|
+
Requires-Dist: docling-core[chunking]<3.0.0,>=2.39.0
|
30
30
|
Requires-Dist: docling-ibm-models<4.0.0,>=3.4.4
|
31
31
|
Requires-Dist: docling-parse<5.0.0,>=4.0.0
|
32
32
|
Requires-Dist: filetype<2.0.0,>=1.2.0
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[project]
|
2
2
|
name = "docling"
|
3
|
-
version = "2.
|
3
|
+
version = "2.39.0" # DO NOT EDIT, updated automatically
|
4
4
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
5
5
|
license = "MIT"
|
6
6
|
keywords = [
|
@@ -44,7 +44,7 @@ authors = [
|
|
44
44
|
requires-python = '>=3.9,<4.0'
|
45
45
|
dependencies = [
|
46
46
|
'pydantic (>=2.0.0,<3.0.0)',
|
47
|
-
'docling-core[chunking] (>=2.
|
47
|
+
'docling-core[chunking] (>=2.39.0,<3.0.0)',
|
48
48
|
'docling-ibm-models (>=3.4.4,<4.0.0)',
|
49
49
|
'docling-parse (>=4.0.0,<5.0.0)',
|
50
50
|
'filetype (>=1.2.0,<2.0.0)',
|
@@ -41,12 +41,12 @@ def test_e2e_pptx_conversions():
|
|
41
41
|
doc: DoclingDocument = conv_result.document
|
42
42
|
|
43
43
|
pred_md: str = doc.export_to_markdown()
|
44
|
-
assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"
|
44
|
+
assert verify_export(pred_md, str(gt_path) + ".md", GENERATE), "export to md"
|
45
45
|
|
46
46
|
pred_itxt: str = doc._export_to_indented_text(
|
47
47
|
max_text_len=70, explicit_tables=False
|
48
48
|
)
|
49
|
-
assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
|
49
|
+
assert verify_export(pred_itxt, str(gt_path) + ".itxt", GENERATE), (
|
50
50
|
"export to indented-text"
|
51
51
|
)
|
52
52
|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|