docling-core 2.8.0__tar.gz → 2.10.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.8.0 → docling_core-2.10.0}/PKG-INFO +2 -1
- docling_core-2.10.0/docling_core/cli/__init__.py +1 -0
- docling_core-2.10.0/docling_core/cli/view.py +68 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/transforms/chunker/hybrid_chunker.py +5 -1
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/doc/document.py +102 -63
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/legacy_doc/base.py +1 -0
- docling_core-2.10.0/docling_core/utils/legacy.py +633 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/pyproject.toml +3 -1
- {docling_core-2.8.0 → docling_core-2.10.0}/LICENSE +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/README.md +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/__init__.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/py.typed +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/search/package.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/base.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.8.0 → docling_core-2.10.0}/docling_core/utils/validators.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.8.0
|
|
3
|
+
Version: 2.10.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Home-page: https://ds4sd.github.io/
|
|
6
6
|
License: MIT
|
|
@@ -35,6 +35,7 @@ Requires-Dist: pyyaml (>=5.1,<7.0.0)
|
|
|
35
35
|
Requires-Dist: semchunk (>=2.2.0,<3.0.0) ; extra == "chunking"
|
|
36
36
|
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
|
37
37
|
Requires-Dist: transformers (>=4.34.0,<5.0.0) ; extra == "chunking"
|
|
38
|
+
Requires-Dist: typer (>=0.12.5,<0.13.0)
|
|
38
39
|
Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)
|
|
39
40
|
Project-URL: Repository, https://github.com/DS4SD/docling-core
|
|
40
41
|
Description-Content-Type: text/markdown
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""CLI package."""
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""CLI for docling viewer."""
|
|
7
|
+
import importlib
|
|
8
|
+
import tempfile
|
|
9
|
+
import webbrowser
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Annotated, Optional
|
|
12
|
+
|
|
13
|
+
import typer
|
|
14
|
+
|
|
15
|
+
from docling_core.types.doc import DoclingDocument
|
|
16
|
+
from docling_core.types.doc.base import ImageRefMode
|
|
17
|
+
from docling_core.utils.file import resolve_source_to_path
|
|
18
|
+
|
|
19
|
+
app = typer.Typer(
|
|
20
|
+
name="Docling",
|
|
21
|
+
no_args_is_help=True,
|
|
22
|
+
add_completion=False,
|
|
23
|
+
pretty_exceptions_enable=False,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def version_callback(value: bool):
|
|
28
|
+
"""Callback for version inspection."""
|
|
29
|
+
if value:
|
|
30
|
+
docling_core_version = importlib.metadata.version("docling-core")
|
|
31
|
+
print(f"Docling Core version: {docling_core_version}")
|
|
32
|
+
raise typer.Exit()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@app.command(no_args_is_help=True)
|
|
36
|
+
def view(
|
|
37
|
+
source: Annotated[
|
|
38
|
+
str,
|
|
39
|
+
typer.Argument(
|
|
40
|
+
...,
|
|
41
|
+
metavar="source",
|
|
42
|
+
help="Docling JSON file to view.",
|
|
43
|
+
),
|
|
44
|
+
],
|
|
45
|
+
version: Annotated[
|
|
46
|
+
Optional[bool],
|
|
47
|
+
typer.Option(
|
|
48
|
+
"--version",
|
|
49
|
+
callback=version_callback,
|
|
50
|
+
is_eager=True,
|
|
51
|
+
help="Show version information.",
|
|
52
|
+
),
|
|
53
|
+
] = None,
|
|
54
|
+
):
|
|
55
|
+
"""Display a Docling JSON file on the default browser."""
|
|
56
|
+
path = resolve_source_to_path(source=source)
|
|
57
|
+
doc = DoclingDocument.load_from_json(filename=path)
|
|
58
|
+
target_path = Path(tempfile.mkdtemp()) / "out.html"
|
|
59
|
+
html_output = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED)
|
|
60
|
+
with open(target_path, "w") as f:
|
|
61
|
+
f.write(html_output)
|
|
62
|
+
webbrowser.open(url=f"file://{target_path.absolute().resolve()}")
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
click_app = typer.main.get_command(app)
|
|
66
|
+
|
|
67
|
+
if __name__ == "__main__":
|
|
68
|
+
app()
|
{docling_core-2.8.0 → docling_core-2.10.0}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
@@ -44,7 +44,9 @@ class HybridChunker(BaseChunker):
|
|
|
44
44
|
|
|
45
45
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
46
46
|
|
|
47
|
-
tokenizer: Union[PreTrainedTokenizerBase, str]
|
|
47
|
+
tokenizer: Union[PreTrainedTokenizerBase, str] = (
|
|
48
|
+
"sentence-transformers/all-MiniLM-L6-v2"
|
|
49
|
+
)
|
|
48
50
|
max_tokens: int = None # type: ignore[assignment]
|
|
49
51
|
merge_peers: bool = True
|
|
50
52
|
|
|
@@ -96,6 +98,7 @@ class HybridChunker(BaseChunker):
|
|
|
96
98
|
doc_items=doc_chunk.meta.doc_items[window_start : window_end + 1],
|
|
97
99
|
headings=doc_chunk.meta.headings,
|
|
98
100
|
captions=doc_chunk.meta.captions,
|
|
101
|
+
origin=doc_chunk.meta.origin,
|
|
99
102
|
)
|
|
100
103
|
new_chunk = DocChunk(text=window_text, meta=meta)
|
|
101
104
|
return new_chunk
|
|
@@ -242,6 +245,7 @@ class HybridChunker(BaseChunker):
|
|
|
242
245
|
doc_items=window_items,
|
|
243
246
|
headings=current_headings_and_captions[0],
|
|
244
247
|
captions=current_headings_and_captions[1],
|
|
248
|
+
origin=chunk.meta.origin,
|
|
245
249
|
)
|
|
246
250
|
new_chunk = DocChunk(
|
|
247
251
|
text=window_text,
|
|
@@ -49,7 +49,6 @@ DEFAULT_EXPORT_LABELS = {
|
|
|
49
49
|
DocItemLabel.DOCUMENT_INDEX,
|
|
50
50
|
DocItemLabel.SECTION_HEADER,
|
|
51
51
|
DocItemLabel.PARAGRAPH,
|
|
52
|
-
DocItemLabel.CAPTION,
|
|
53
52
|
DocItemLabel.TABLE,
|
|
54
53
|
DocItemLabel.PICTURE,
|
|
55
54
|
DocItemLabel.FORMULA,
|
|
@@ -58,6 +57,7 @@ DEFAULT_EXPORT_LABELS = {
|
|
|
58
57
|
DocItemLabel.TEXT,
|
|
59
58
|
DocItemLabel.LIST_ITEM,
|
|
60
59
|
DocItemLabel.CODE,
|
|
60
|
+
DocItemLabel.REFERENCE,
|
|
61
61
|
}
|
|
62
62
|
|
|
63
63
|
|
|
@@ -380,6 +380,7 @@ class DocumentOrigin(BaseModel):
|
|
|
380
380
|
"application/vnd.openxmlformats-officedocument.presentationml.template",
|
|
381
381
|
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
|
|
382
382
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
383
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
383
384
|
"text/asciidoc",
|
|
384
385
|
"text/markdown",
|
|
385
386
|
]
|
|
@@ -445,7 +446,7 @@ class ImageRef(BaseModel):
|
|
|
445
446
|
mimetype: str
|
|
446
447
|
dpi: int
|
|
447
448
|
size: Size
|
|
448
|
-
uri: Union[AnyUrl, Path]
|
|
449
|
+
uri: Union[AnyUrl, Path] = Field(union_mode="left_to_right")
|
|
449
450
|
_pil: Optional[PILImage.Image] = None
|
|
450
451
|
|
|
451
452
|
@property
|
|
@@ -592,6 +593,21 @@ class DocItem(
|
|
|
592
593
|
class TextItem(DocItem):
|
|
593
594
|
"""TextItem."""
|
|
594
595
|
|
|
596
|
+
label: typing.Literal[
|
|
597
|
+
DocItemLabel.CAPTION,
|
|
598
|
+
DocItemLabel.CHECKBOX_SELECTED,
|
|
599
|
+
DocItemLabel.CHECKBOX_UNSELECTED,
|
|
600
|
+
DocItemLabel.CODE,
|
|
601
|
+
DocItemLabel.FOOTNOTE,
|
|
602
|
+
DocItemLabel.FORMULA,
|
|
603
|
+
DocItemLabel.PAGE_FOOTER,
|
|
604
|
+
DocItemLabel.PAGE_HEADER,
|
|
605
|
+
DocItemLabel.PARAGRAPH,
|
|
606
|
+
DocItemLabel.REFERENCE,
|
|
607
|
+
DocItemLabel.TEXT,
|
|
608
|
+
DocItemLabel.TITLE,
|
|
609
|
+
]
|
|
610
|
+
|
|
595
611
|
orig: str # untreated representation
|
|
596
612
|
text: str # sanitized representation
|
|
597
613
|
|
|
@@ -643,8 +659,10 @@ class TextItem(DocItem):
|
|
|
643
659
|
class SectionHeaderItem(TextItem):
|
|
644
660
|
"""SectionItem."""
|
|
645
661
|
|
|
646
|
-
label: typing.Literal[DocItemLabel.SECTION_HEADER] =
|
|
647
|
-
|
|
662
|
+
label: typing.Literal[DocItemLabel.SECTION_HEADER] = (
|
|
663
|
+
DocItemLabel.SECTION_HEADER # type: ignore[assignment]
|
|
664
|
+
)
|
|
665
|
+
level: LevelNumber = 1
|
|
648
666
|
|
|
649
667
|
def export_to_document_tokens(
|
|
650
668
|
self,
|
|
@@ -694,9 +712,11 @@ class SectionHeaderItem(TextItem):
|
|
|
694
712
|
class ListItem(TextItem):
|
|
695
713
|
"""SectionItem."""
|
|
696
714
|
|
|
697
|
-
label: typing.Literal[DocItemLabel.LIST_ITEM] =
|
|
715
|
+
label: typing.Literal[DocItemLabel.LIST_ITEM] = (
|
|
716
|
+
DocItemLabel.LIST_ITEM # type: ignore[assignment]
|
|
717
|
+
)
|
|
698
718
|
enumerated: bool = False
|
|
699
|
-
marker: str # The bullet or number symbol that prefixes this list item
|
|
719
|
+
marker: str = "-" # The bullet or number symbol that prefixes this list item
|
|
700
720
|
|
|
701
721
|
|
|
702
722
|
class FloatingItem(DocItem):
|
|
@@ -922,7 +942,10 @@ class TableItem(FloatingItem):
|
|
|
922
942
|
"""TableItem."""
|
|
923
943
|
|
|
924
944
|
data: TableData
|
|
925
|
-
label: typing.Literal[
|
|
945
|
+
label: typing.Literal[
|
|
946
|
+
DocItemLabel.DOCUMENT_INDEX,
|
|
947
|
+
DocItemLabel.TABLE,
|
|
948
|
+
] = DocItemLabel.TABLE
|
|
926
949
|
|
|
927
950
|
def export_to_dataframe(self) -> pd.DataFrame:
|
|
928
951
|
"""Export the table as a Pandas DataFrame."""
|
|
@@ -1271,9 +1294,19 @@ class TableItem(FloatingItem):
|
|
|
1271
1294
|
class KeyValueItem(DocItem):
|
|
1272
1295
|
"""KeyValueItem."""
|
|
1273
1296
|
|
|
1297
|
+
label: typing.Literal[DocItemLabel.KEY_VALUE_REGION] = DocItemLabel.KEY_VALUE_REGION
|
|
1274
1298
|
|
|
1275
|
-
|
|
1276
|
-
|
|
1299
|
+
|
|
1300
|
+
ContentItem = Annotated[
|
|
1301
|
+
Union[
|
|
1302
|
+
TextItem,
|
|
1303
|
+
SectionHeaderItem,
|
|
1304
|
+
ListItem,
|
|
1305
|
+
PictureItem,
|
|
1306
|
+
TableItem,
|
|
1307
|
+
KeyValueItem,
|
|
1308
|
+
],
|
|
1309
|
+
Field(discriminator="label"),
|
|
1277
1310
|
]
|
|
1278
1311
|
|
|
1279
1312
|
|
|
@@ -1375,13 +1408,13 @@ class DoclingDocument(BaseModel):
|
|
|
1375
1408
|
self,
|
|
1376
1409
|
label: Optional[GroupLabel] = None,
|
|
1377
1410
|
name: Optional[str] = None,
|
|
1378
|
-
parent: Optional[
|
|
1411
|
+
parent: Optional[NodeItem] = None,
|
|
1379
1412
|
) -> GroupItem:
|
|
1380
1413
|
"""add_group.
|
|
1381
1414
|
|
|
1382
1415
|
:param label: Optional[GroupLabel]: (Default value = None)
|
|
1383
1416
|
:param name: Optional[str]: (Default value = None)
|
|
1384
|
-
:param parent: Optional[
|
|
1417
|
+
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1385
1418
|
|
|
1386
1419
|
"""
|
|
1387
1420
|
if not parent:
|
|
@@ -1408,7 +1441,7 @@ class DoclingDocument(BaseModel):
|
|
|
1408
1441
|
marker: Optional[str] = None,
|
|
1409
1442
|
orig: Optional[str] = None,
|
|
1410
1443
|
prov: Optional[ProvenanceItem] = None,
|
|
1411
|
-
parent: Optional[
|
|
1444
|
+
parent: Optional[NodeItem] = None,
|
|
1412
1445
|
):
|
|
1413
1446
|
"""add_list_item.
|
|
1414
1447
|
|
|
@@ -1416,7 +1449,7 @@ class DoclingDocument(BaseModel):
|
|
|
1416
1449
|
:param text: str:
|
|
1417
1450
|
:param orig: Optional[str]: (Default value = None)
|
|
1418
1451
|
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1419
|
-
:param parent: Optional[
|
|
1452
|
+
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1420
1453
|
|
|
1421
1454
|
"""
|
|
1422
1455
|
if not parent:
|
|
@@ -1451,7 +1484,7 @@ class DoclingDocument(BaseModel):
|
|
|
1451
1484
|
text: str,
|
|
1452
1485
|
orig: Optional[str] = None,
|
|
1453
1486
|
prov: Optional[ProvenanceItem] = None,
|
|
1454
|
-
parent: Optional[
|
|
1487
|
+
parent: Optional[NodeItem] = None,
|
|
1455
1488
|
):
|
|
1456
1489
|
"""add_text.
|
|
1457
1490
|
|
|
@@ -1459,7 +1492,7 @@ class DoclingDocument(BaseModel):
|
|
|
1459
1492
|
:param text: str:
|
|
1460
1493
|
:param orig: Optional[str]: (Default value = None)
|
|
1461
1494
|
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1462
|
-
:param parent: Optional[
|
|
1495
|
+
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1463
1496
|
|
|
1464
1497
|
"""
|
|
1465
1498
|
# Catch a few cases that are in principle allowed
|
|
@@ -1503,15 +1536,16 @@ class DoclingDocument(BaseModel):
|
|
|
1503
1536
|
data: TableData,
|
|
1504
1537
|
caption: Optional[Union[TextItem, RefItem]] = None, # This is not cool yet.
|
|
1505
1538
|
prov: Optional[ProvenanceItem] = None,
|
|
1506
|
-
parent: Optional[
|
|
1539
|
+
parent: Optional[NodeItem] = None,
|
|
1540
|
+
label: DocItemLabel = DocItemLabel.TABLE,
|
|
1507
1541
|
):
|
|
1508
1542
|
"""add_table.
|
|
1509
1543
|
|
|
1510
|
-
:param data:
|
|
1511
|
-
:param caption: Optional[Union[TextItem:
|
|
1512
|
-
:param
|
|
1513
|
-
:param
|
|
1514
|
-
:param
|
|
1544
|
+
:param data: TableData:
|
|
1545
|
+
:param caption: Optional[Union[TextItem, RefItem]]: (Default value = None)
|
|
1546
|
+
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1547
|
+
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1548
|
+
:param label: DocItemLabel: (Default value = DocItemLabel.TABLE)
|
|
1515
1549
|
|
|
1516
1550
|
"""
|
|
1517
1551
|
if not parent:
|
|
@@ -1521,7 +1555,7 @@ class DoclingDocument(BaseModel):
|
|
|
1521
1555
|
cref = f"#/tables/{table_index}"
|
|
1522
1556
|
|
|
1523
1557
|
tbl_item = TableItem(
|
|
1524
|
-
label=
|
|
1558
|
+
label=label, data=data, self_ref=cref, parent=parent.get_ref()
|
|
1525
1559
|
)
|
|
1526
1560
|
if prov:
|
|
1527
1561
|
tbl_item.prov.append(prov)
|
|
@@ -1539,7 +1573,7 @@ class DoclingDocument(BaseModel):
|
|
|
1539
1573
|
image: Optional[ImageRef] = None,
|
|
1540
1574
|
caption: Optional[Union[TextItem, RefItem]] = None,
|
|
1541
1575
|
prov: Optional[ProvenanceItem] = None,
|
|
1542
|
-
parent: Optional[
|
|
1576
|
+
parent: Optional[NodeItem] = None,
|
|
1543
1577
|
):
|
|
1544
1578
|
"""add_picture.
|
|
1545
1579
|
|
|
@@ -1547,7 +1581,7 @@ class DoclingDocument(BaseModel):
|
|
|
1547
1581
|
:param caption: Optional[Union[TextItem:
|
|
1548
1582
|
:param RefItem]]: (Default value = None)
|
|
1549
1583
|
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1550
|
-
:param parent: Optional[
|
|
1584
|
+
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1551
1585
|
"""
|
|
1552
1586
|
if not parent:
|
|
1553
1587
|
parent = self.body
|
|
@@ -1577,14 +1611,14 @@ class DoclingDocument(BaseModel):
|
|
|
1577
1611
|
text: str,
|
|
1578
1612
|
orig: Optional[str] = None,
|
|
1579
1613
|
prov: Optional[ProvenanceItem] = None,
|
|
1580
|
-
parent: Optional[
|
|
1614
|
+
parent: Optional[NodeItem] = None,
|
|
1581
1615
|
):
|
|
1582
1616
|
"""add_title.
|
|
1583
1617
|
|
|
1584
1618
|
:param text: str:
|
|
1585
1619
|
:param orig: Optional[str]: (Default value = None)
|
|
1586
1620
|
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1587
|
-
:param parent: Optional[
|
|
1621
|
+
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1588
1622
|
"""
|
|
1589
1623
|
if not parent:
|
|
1590
1624
|
parent = self.body
|
|
@@ -1615,7 +1649,7 @@ class DoclingDocument(BaseModel):
|
|
|
1615
1649
|
orig: Optional[str] = None,
|
|
1616
1650
|
level: LevelNumber = 1,
|
|
1617
1651
|
prov: Optional[ProvenanceItem] = None,
|
|
1618
|
-
parent: Optional[
|
|
1652
|
+
parent: Optional[NodeItem] = None,
|
|
1619
1653
|
):
|
|
1620
1654
|
"""add_heading.
|
|
1621
1655
|
|
|
@@ -1624,7 +1658,7 @@ class DoclingDocument(BaseModel):
|
|
|
1624
1658
|
:param orig: Optional[str]: (Default value = None)
|
|
1625
1659
|
:param level: LevelNumber: (Default value = 1)
|
|
1626
1660
|
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1627
|
-
:param parent: Optional[
|
|
1661
|
+
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1628
1662
|
"""
|
|
1629
1663
|
if not parent:
|
|
1630
1664
|
parent = self.body
|
|
@@ -1668,7 +1702,7 @@ class DoclingDocument(BaseModel):
|
|
|
1668
1702
|
self,
|
|
1669
1703
|
root: Optional[NodeItem] = None,
|
|
1670
1704
|
with_groups: bool = False,
|
|
1671
|
-
traverse_pictures: bool =
|
|
1705
|
+
traverse_pictures: bool = False,
|
|
1672
1706
|
page_no: Optional[int] = None,
|
|
1673
1707
|
_level: int = 0, # fixed parameter, carries through the node nesting level
|
|
1674
1708
|
) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
|
|
@@ -1685,30 +1719,31 @@ class DoclingDocument(BaseModel):
|
|
|
1685
1719
|
if not root:
|
|
1686
1720
|
root = self.body
|
|
1687
1721
|
|
|
1722
|
+
# Yield non-group items or group items when with_groups=True
|
|
1688
1723
|
if not isinstance(root, GroupItem) or with_groups:
|
|
1689
1724
|
if isinstance(root, DocItem):
|
|
1690
|
-
if page_no is
|
|
1691
|
-
for prov in root.prov
|
|
1692
|
-
|
|
1693
|
-
yield root, _level
|
|
1694
|
-
else:
|
|
1725
|
+
if page_no is None or any(
|
|
1726
|
+
prov.page_no == page_no for prov in root.prov
|
|
1727
|
+
):
|
|
1695
1728
|
yield root, _level
|
|
1696
1729
|
else:
|
|
1697
1730
|
yield root, _level
|
|
1698
1731
|
|
|
1732
|
+
# Handle picture traversal - only traverse children if requested
|
|
1733
|
+
if isinstance(root, PictureItem) and not traverse_pictures:
|
|
1734
|
+
return
|
|
1735
|
+
|
|
1699
1736
|
# Traverse children
|
|
1700
1737
|
for child_ref in root.children:
|
|
1701
1738
|
child = child_ref.resolve(self)
|
|
1702
|
-
|
|
1703
1739
|
if isinstance(child, NodeItem):
|
|
1704
|
-
|
|
1705
|
-
|
|
1706
|
-
|
|
1707
|
-
|
|
1708
|
-
|
|
1709
|
-
|
|
1710
|
-
|
|
1711
|
-
)
|
|
1740
|
+
yield from self.iterate_items(
|
|
1741
|
+
child,
|
|
1742
|
+
with_groups=with_groups,
|
|
1743
|
+
traverse_pictures=traverse_pictures,
|
|
1744
|
+
page_no=page_no,
|
|
1745
|
+
_level=_level + 1,
|
|
1746
|
+
)
|
|
1712
1747
|
|
|
1713
1748
|
def _clear_picture_pil_cache(self):
|
|
1714
1749
|
"""Clear cache storage of all images."""
|
|
@@ -1864,7 +1899,7 @@ class DoclingDocument(BaseModel):
|
|
|
1864
1899
|
|
|
1865
1900
|
"""
|
|
1866
1901
|
with open(filename, "r") as f:
|
|
1867
|
-
return cls.
|
|
1902
|
+
return cls.model_validate_json(f.read())
|
|
1868
1903
|
|
|
1869
1904
|
def save_as_yaml(
|
|
1870
1905
|
self,
|
|
@@ -2053,10 +2088,6 @@ class DoclingDocument(BaseModel):
|
|
|
2053
2088
|
text = f"```\n{item.text}\n```\n"
|
|
2054
2089
|
mdtexts.append(text)
|
|
2055
2090
|
|
|
2056
|
-
elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
|
|
2057
|
-
# captions are printed in picture and table ... skipping for now
|
|
2058
|
-
continue
|
|
2059
|
-
|
|
2060
2091
|
elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
|
|
2061
2092
|
in_list = True
|
|
2062
2093
|
# Calculate indent based on list_nesting_level
|
|
@@ -2115,10 +2146,30 @@ class DoclingDocument(BaseModel):
|
|
|
2115
2146
|
# Bold, Italic, or Bold-Italic
|
|
2116
2147
|
# Hence, any underscore that we print into Markdown is coming from document text
|
|
2117
2148
|
# That means we need to escape it, to properly reflect content in the markdown
|
|
2149
|
+
# However, we need to preserve underscores in image URLs
|
|
2150
|
+
# to maintain their validity
|
|
2151
|
+
# For example: ![image](path/to_image.png) should remain unchanged
|
|
2118
2152
|
def escape_underscores(text):
|
|
2119
|
-
|
|
2120
|
-
|
|
2121
|
-
|
|
2153
|
+
"""Escape underscores but leave them intact in the URL.."""
|
|
2154
|
+
# Firstly, identify all the URL patterns.
|
|
2155
|
+
url_pattern = r"!\[.*?\]\((.*?)\)"
|
|
2156
|
+
parts = []
|
|
2157
|
+
last_end = 0
|
|
2158
|
+
|
|
2159
|
+
for match in re.finditer(url_pattern, text):
|
|
2160
|
+
# Text to add before the URL (needs to be escaped)
|
|
2161
|
+
before_url = text[last_end : match.start()]
|
|
2162
|
+
parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
|
|
2163
|
+
|
|
2164
|
+
# Add the full URL part (do not escape)
|
|
2165
|
+
parts.append(match.group(0))
|
|
2166
|
+
last_end = match.end()
|
|
2167
|
+
|
|
2168
|
+
# Add the final part of the text (which needs to be escaped)
|
|
2169
|
+
if last_end < len(text):
|
|
2170
|
+
parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))
|
|
2171
|
+
|
|
2172
|
+
return "".join(parts)
|
|
2122
2173
|
|
|
2123
2174
|
mdtext = escape_underscores(mdtext)
|
|
2124
2175
|
|
|
@@ -2328,10 +2379,6 @@ class DoclingDocument(BaseModel):
|
|
|
2328
2379
|
text = f"<pre>{item.text}</pre>"
|
|
2329
2380
|
html_texts.append(text)
|
|
2330
2381
|
|
|
2331
|
-
elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
|
|
2332
|
-
# captions are printed in picture and table ... skipping for now
|
|
2333
|
-
continue
|
|
2334
|
-
|
|
2335
2382
|
elif isinstance(item, ListItem):
|
|
2336
2383
|
|
|
2337
2384
|
text = f"<li>{item.text}</li>"
|
|
@@ -2533,10 +2580,6 @@ class DoclingDocument(BaseModel):
|
|
|
2533
2580
|
result += f"<unordered_list>{delim}"
|
|
2534
2581
|
in_ordered_list.append(False)
|
|
2535
2582
|
|
|
2536
|
-
elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
|
|
2537
|
-
# captions are printed in picture and table ... skipping for now
|
|
2538
|
-
continue
|
|
2539
|
-
|
|
2540
2583
|
elif isinstance(item, SectionHeaderItem):
|
|
2541
2584
|
|
|
2542
2585
|
result += item.export_to_document_tokens(
|
|
@@ -2642,10 +2685,6 @@ class DoclingDocument(BaseModel):
|
|
|
2642
2685
|
indent * level + f"item-{i} at level {level}: {item.label}: {text}"
|
|
2643
2686
|
)
|
|
2644
2687
|
|
|
2645
|
-
elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
|
|
2646
|
-
# captions are printed in picture and table ... skipping for now
|
|
2647
|
-
continue
|
|
2648
|
-
|
|
2649
2688
|
elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
|
|
2650
2689
|
text = get_text(text=item.text, max_text_len=max_text_len)
|
|
2651
2690
|
|