docling-core 2.17.1__tar.gz → 2.18.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.17.1 → docling_core-2.18.0}/PKG-INFO +2 -2
- {docling_core-2.17.1 → docling_core-2.18.0}/README.md +1 -1
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/doc/document.py +142 -28
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/doc/utils.py +27 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/utils/legacy.py +3 -3
- {docling_core-2.17.1 → docling_core-2.18.0}/pyproject.toml +1 -1
- {docling_core-2.17.1 → docling_core-2.18.0}/LICENSE +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/cli/view.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/py.typed +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/search/package.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/base.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.17.1 → docling_core-2.18.0}/docling_core/utils/validators.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.17.1
|
|
3
|
+
Version: 2.18.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Home-page: https://ds4sd.github.io/
|
|
6
6
|
License: MIT
|
|
@@ -66,7 +66,7 @@ pip install docling-core
|
|
|
66
66
|
|
|
67
67
|
To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 / 3.13 and Poetry. You can then install from your local clone's root dir:
|
|
68
68
|
```bash
|
|
69
|
-
poetry install
|
|
69
|
+
poetry install --all-extras
|
|
70
70
|
```
|
|
71
71
|
|
|
72
72
|
To run the pytest suite, execute:
|
|
@@ -23,7 +23,7 @@ pip install docling-core
|
|
|
23
23
|
|
|
24
24
|
To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 / 3.13 and Poetry. You can then install from your local clone's root dir:
|
|
25
25
|
```bash
|
|
26
|
-
poetry install
|
|
26
|
+
poetry install --all-extras
|
|
27
27
|
```
|
|
28
28
|
|
|
29
29
|
To run the pytest suite, execute:
|
|
@@ -13,6 +13,7 @@ import sys
|
|
|
13
13
|
import textwrap
|
|
14
14
|
import typing
|
|
15
15
|
import warnings
|
|
16
|
+
from enum import Enum
|
|
16
17
|
from io import BytesIO
|
|
17
18
|
from pathlib import Path
|
|
18
19
|
from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
|
|
@@ -44,13 +45,17 @@ from docling_core.types.doc import BoundingBox, Size
|
|
|
44
45
|
from docling_core.types.doc.base import ImageRefMode
|
|
45
46
|
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
|
|
46
47
|
from docling_core.types.doc.tokens import DocumentToken, TableToken
|
|
47
|
-
from docling_core.types.doc.utils import relative_path
|
|
48
|
+
from docling_core.types.doc.utils import (
|
|
49
|
+
get_html_tag_with_text_direction,
|
|
50
|
+
get_text_direction,
|
|
51
|
+
relative_path,
|
|
52
|
+
)
|
|
48
53
|
|
|
49
54
|
_logger = logging.getLogger(__name__)
|
|
50
55
|
|
|
51
56
|
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
|
|
52
57
|
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
|
|
53
|
-
CURRENT_VERSION: Final = "1.0.0"
|
|
58
|
+
CURRENT_VERSION: Final = "1.1.0"
|
|
54
59
|
|
|
55
60
|
DEFAULT_EXPORT_LABELS = {
|
|
56
61
|
DocItemLabel.TITLE,
|
|
@@ -66,6 +71,8 @@ DEFAULT_EXPORT_LABELS = {
|
|
|
66
71
|
DocItemLabel.LIST_ITEM,
|
|
67
72
|
DocItemLabel.CODE,
|
|
68
73
|
DocItemLabel.REFERENCE,
|
|
74
|
+
DocItemLabel.PAGE_HEADER,
|
|
75
|
+
DocItemLabel.PAGE_FOOTER,
|
|
69
76
|
}
|
|
70
77
|
|
|
71
78
|
|
|
@@ -509,6 +516,16 @@ class ProvenanceItem(BaseModel):
|
|
|
509
516
|
charspan: Tuple[int, int]
|
|
510
517
|
|
|
511
518
|
|
|
519
|
+
class ContentLayer(str, Enum):
|
|
520
|
+
"""ContentLayer."""
|
|
521
|
+
|
|
522
|
+
BODY = "body"
|
|
523
|
+
FURNITURE = "furniture"
|
|
524
|
+
|
|
525
|
+
|
|
526
|
+
DEFAULT_CONTENT_LAYERS = {ContentLayer.BODY}
|
|
527
|
+
|
|
528
|
+
|
|
512
529
|
class NodeItem(BaseModel):
|
|
513
530
|
"""NodeItem."""
|
|
514
531
|
|
|
@@ -516,6 +533,8 @@ class NodeItem(BaseModel):
|
|
|
516
533
|
parent: Optional[RefItem] = None
|
|
517
534
|
children: List[RefItem] = []
|
|
518
535
|
|
|
536
|
+
content_layer: ContentLayer = ContentLayer.BODY
|
|
537
|
+
|
|
519
538
|
model_config = ConfigDict(extra="forbid")
|
|
520
539
|
|
|
521
540
|
def get_ref(self):
|
|
@@ -866,7 +885,9 @@ class PictureItem(FloatingItem):
|
|
|
866
885
|
|
|
867
886
|
caption_text = ""
|
|
868
887
|
if len(text) > 0:
|
|
869
|
-
caption_text = f"<figcaption>{text}</figcaption>"
|
|
888
|
+
caption_text = get_html_tag_with_text_direction(
|
|
889
|
+
html_tag="figcaption", text=text
|
|
890
|
+
)
|
|
870
891
|
|
|
871
892
|
default_response = f"<figure>{caption_text}</figure>"
|
|
872
893
|
|
|
@@ -1090,15 +1111,28 @@ class TableItem(FloatingItem):
|
|
|
1090
1111
|
if colspan > 1:
|
|
1091
1112
|
opening_tag += f' colspan="{colspan}"'
|
|
1092
1113
|
|
|
1114
|
+
text_dir = get_text_direction(content)
|
|
1115
|
+
if text_dir == "rtl":
|
|
1116
|
+
opening_tag += f' dir="{dir}"'
|
|
1117
|
+
|
|
1093
1118
|
body += f"<{opening_tag}>{content}</{celltag}>"
|
|
1094
1119
|
body += "</tr>"
|
|
1095
1120
|
|
|
1121
|
+
# dir = get_text_direction(text)
|
|
1122
|
+
|
|
1096
1123
|
if len(text) > 0 and len(body) > 0:
|
|
1097
|
-
|
|
1124
|
+
caption_text = get_html_tag_with_text_direction(
|
|
1125
|
+
html_tag="caption", text=text
|
|
1126
|
+
)
|
|
1127
|
+
body = f"<table>{caption_text}<tbody>{body}</tbody></table>"
|
|
1128
|
+
|
|
1098
1129
|
elif len(text) == 0 and len(body) > 0:
|
|
1099
1130
|
body = f"<table><tbody>{body}</tbody></table>"
|
|
1100
1131
|
elif len(text) > 0 and len(body) == 0:
|
|
1101
|
-
|
|
1132
|
+
caption_text = get_html_tag_with_text_direction(
|
|
1133
|
+
html_tag="caption", text=text
|
|
1134
|
+
)
|
|
1135
|
+
body = f"<table>{caption_text}</table>"
|
|
1102
1136
|
else:
|
|
1103
1137
|
body = "<table></table>"
|
|
1104
1138
|
|
|
@@ -1423,8 +1457,8 @@ class DoclingDocument(BaseModel):
|
|
|
1423
1457
|
# generated from synthetic data.
|
|
1424
1458
|
)
|
|
1425
1459
|
|
|
1426
|
-
furniture: GroupItem = GroupItem(
|
|
1427
|
-
name="_root_", self_ref="#/furniture"
|
|
1460
|
+
furniture: Annotated[GroupItem, Field(deprecated=True)] = GroupItem(
|
|
1461
|
+
name="_root_", self_ref="#/furniture", content_layer=ContentLayer.FURNITURE
|
|
1428
1462
|
) # List[RefItem] = []
|
|
1429
1463
|
body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []
|
|
1430
1464
|
|
|
@@ -1436,11 +1470,28 @@ class DoclingDocument(BaseModel):
|
|
|
1436
1470
|
|
|
1437
1471
|
pages: Dict[int, PageItem] = {} # empty as default
|
|
1438
1472
|
|
|
1473
|
+
@model_validator(mode="before")
|
|
1474
|
+
@classmethod
|
|
1475
|
+
def transform_to_content_layer(cls, data: dict) -> dict:
|
|
1476
|
+
"""transform_to_content_layer."""
|
|
1477
|
+
# Since version 1.1.0, all NodeItems carry content_layer property.
|
|
1478
|
+
# We must assign previous page_header and page_footer instances to furniture.
|
|
1479
|
+
# Note: model_validators which check on the version must use "before".
|
|
1480
|
+
if "version" in data and data["version"] == "1.0.0":
|
|
1481
|
+
for item in data.get("texts", []):
|
|
1482
|
+
if "label" in item and item["label"] in [
|
|
1483
|
+
DocItemLabel.PAGE_HEADER.value,
|
|
1484
|
+
DocItemLabel.PAGE_FOOTER.value,
|
|
1485
|
+
]:
|
|
1486
|
+
item["content_layer"] = "furniture"
|
|
1487
|
+
return data
|
|
1488
|
+
|
|
1439
1489
|
def add_group(
|
|
1440
1490
|
self,
|
|
1441
1491
|
label: Optional[GroupLabel] = None,
|
|
1442
1492
|
name: Optional[str] = None,
|
|
1443
1493
|
parent: Optional[NodeItem] = None,
|
|
1494
|
+
content_layer: Optional[ContentLayer] = None,
|
|
1444
1495
|
) -> GroupItem:
|
|
1445
1496
|
"""add_group.
|
|
1446
1497
|
|
|
@@ -1460,6 +1511,8 @@ class DoclingDocument(BaseModel):
|
|
|
1460
1511
|
group.name = name
|
|
1461
1512
|
if label is not None:
|
|
1462
1513
|
group.label = label
|
|
1514
|
+
if content_layer:
|
|
1515
|
+
group.content_layer = content_layer
|
|
1463
1516
|
|
|
1464
1517
|
self.groups.append(group)
|
|
1465
1518
|
parent.children.append(RefItem(cref=cref))
|
|
@@ -1474,6 +1527,7 @@ class DoclingDocument(BaseModel):
|
|
|
1474
1527
|
orig: Optional[str] = None,
|
|
1475
1528
|
prov: Optional[ProvenanceItem] = None,
|
|
1476
1529
|
parent: Optional[NodeItem] = None,
|
|
1530
|
+
content_layer: Optional[ContentLayer] = None,
|
|
1477
1531
|
):
|
|
1478
1532
|
"""add_list_item.
|
|
1479
1533
|
|
|
@@ -1504,6 +1558,8 @@ class DoclingDocument(BaseModel):
|
|
|
1504
1558
|
)
|
|
1505
1559
|
if prov:
|
|
1506
1560
|
list_item.prov.append(prov)
|
|
1561
|
+
if content_layer:
|
|
1562
|
+
list_item.content_layer = content_layer
|
|
1507
1563
|
|
|
1508
1564
|
self.texts.append(list_item)
|
|
1509
1565
|
parent.children.append(RefItem(cref=cref))
|
|
@@ -1517,6 +1573,7 @@ class DoclingDocument(BaseModel):
|
|
|
1517
1573
|
orig: Optional[str] = None,
|
|
1518
1574
|
prov: Optional[ProvenanceItem] = None,
|
|
1519
1575
|
parent: Optional[NodeItem] = None,
|
|
1576
|
+
content_layer: Optional[ContentLayer] = None,
|
|
1520
1577
|
):
|
|
1521
1578
|
"""add_text.
|
|
1522
1579
|
|
|
@@ -1530,16 +1587,40 @@ class DoclingDocument(BaseModel):
|
|
|
1530
1587
|
# Catch a few cases that are in principle allowed
|
|
1531
1588
|
# but that will create confusion down the road
|
|
1532
1589
|
if label in [DocItemLabel.TITLE]:
|
|
1533
|
-
return self.add_title(text=text, orig=orig, prov=prov, parent=parent)
|
|
1590
|
+
return self.add_title(
|
|
1591
|
+
text=text,
|
|
1592
|
+
orig=orig,
|
|
1593
|
+
prov=prov,
|
|
1594
|
+
parent=parent,
|
|
1595
|
+
content_layer=content_layer,
|
|
1596
|
+
)
|
|
1534
1597
|
|
|
1535
1598
|
elif label in [DocItemLabel.LIST_ITEM]:
|
|
1536
|
-
return self.add_list_item(text=text, orig=orig, prov=prov, parent=parent)
|
|
1599
|
+
return self.add_list_item(
|
|
1600
|
+
text=text,
|
|
1601
|
+
orig=orig,
|
|
1602
|
+
prov=prov,
|
|
1603
|
+
parent=parent,
|
|
1604
|
+
content_layer=content_layer,
|
|
1605
|
+
)
|
|
1537
1606
|
|
|
1538
1607
|
elif label in [DocItemLabel.SECTION_HEADER]:
|
|
1539
|
-
return self.add_heading(text=text, orig=orig, prov=prov, parent=parent)
|
|
1608
|
+
return self.add_heading(
|
|
1609
|
+
text=text,
|
|
1610
|
+
orig=orig,
|
|
1611
|
+
prov=prov,
|
|
1612
|
+
parent=parent,
|
|
1613
|
+
content_layer=content_layer,
|
|
1614
|
+
)
|
|
1540
1615
|
|
|
1541
1616
|
elif label in [DocItemLabel.CODE]:
|
|
1542
|
-
return self.add_code(text=text, orig=orig, prov=prov, parent=parent)
|
|
1617
|
+
return self.add_code(
|
|
1618
|
+
text=text,
|
|
1619
|
+
orig=orig,
|
|
1620
|
+
prov=prov,
|
|
1621
|
+
parent=parent,
|
|
1622
|
+
content_layer=content_layer,
|
|
1623
|
+
)
|
|
1543
1624
|
|
|
1544
1625
|
else:
|
|
1545
1626
|
|
|
@@ -1561,6 +1642,9 @@ class DoclingDocument(BaseModel):
|
|
|
1561
1642
|
if prov:
|
|
1562
1643
|
text_item.prov.append(prov)
|
|
1563
1644
|
|
|
1645
|
+
if content_layer:
|
|
1646
|
+
text_item.content_layer = content_layer
|
|
1647
|
+
|
|
1564
1648
|
self.texts.append(text_item)
|
|
1565
1649
|
parent.children.append(RefItem(cref=cref))
|
|
1566
1650
|
|
|
@@ -1573,6 +1657,7 @@ class DoclingDocument(BaseModel):
|
|
|
1573
1657
|
prov: Optional[ProvenanceItem] = None,
|
|
1574
1658
|
parent: Optional[NodeItem] = None,
|
|
1575
1659
|
label: DocItemLabel = DocItemLabel.TABLE,
|
|
1660
|
+
content_layer: Optional[ContentLayer] = None,
|
|
1576
1661
|
):
|
|
1577
1662
|
"""add_table.
|
|
1578
1663
|
|
|
@@ -1594,6 +1679,9 @@ class DoclingDocument(BaseModel):
|
|
|
1594
1679
|
)
|
|
1595
1680
|
if prov:
|
|
1596
1681
|
tbl_item.prov.append(prov)
|
|
1682
|
+
if content_layer:
|
|
1683
|
+
tbl_item.content_layer = content_layer
|
|
1684
|
+
|
|
1597
1685
|
if caption:
|
|
1598
1686
|
tbl_item.captions.append(caption.get_ref())
|
|
1599
1687
|
|
|
@@ -1609,6 +1697,7 @@ class DoclingDocument(BaseModel):
|
|
|
1609
1697
|
caption: Optional[Union[TextItem, RefItem]] = None,
|
|
1610
1698
|
prov: Optional[ProvenanceItem] = None,
|
|
1611
1699
|
parent: Optional[NodeItem] = None,
|
|
1700
|
+
content_layer: Optional[ContentLayer] = None,
|
|
1612
1701
|
):
|
|
1613
1702
|
"""add_picture.
|
|
1614
1703
|
|
|
@@ -1633,6 +1722,8 @@ class DoclingDocument(BaseModel):
|
|
|
1633
1722
|
)
|
|
1634
1723
|
if prov:
|
|
1635
1724
|
fig_item.prov.append(prov)
|
|
1725
|
+
if content_layer:
|
|
1726
|
+
fig_item.content_layer = content_layer
|
|
1636
1727
|
if caption:
|
|
1637
1728
|
fig_item.captions.append(caption.get_ref())
|
|
1638
1729
|
|
|
@@ -1647,6 +1738,7 @@ class DoclingDocument(BaseModel):
|
|
|
1647
1738
|
orig: Optional[str] = None,
|
|
1648
1739
|
prov: Optional[ProvenanceItem] = None,
|
|
1649
1740
|
parent: Optional[NodeItem] = None,
|
|
1741
|
+
content_layer: Optional[ContentLayer] = None,
|
|
1650
1742
|
):
|
|
1651
1743
|
"""add_title.
|
|
1652
1744
|
|
|
@@ -1672,6 +1764,8 @@ class DoclingDocument(BaseModel):
|
|
|
1672
1764
|
)
|
|
1673
1765
|
if prov:
|
|
1674
1766
|
text_item.prov.append(prov)
|
|
1767
|
+
if content_layer:
|
|
1768
|
+
text_item.content_layer = content_layer
|
|
1675
1769
|
|
|
1676
1770
|
self.texts.append(text_item)
|
|
1677
1771
|
parent.children.append(RefItem(cref=cref))
|
|
@@ -1685,6 +1779,7 @@ class DoclingDocument(BaseModel):
|
|
|
1685
1779
|
orig: Optional[str] = None,
|
|
1686
1780
|
prov: Optional[ProvenanceItem] = None,
|
|
1687
1781
|
parent: Optional[NodeItem] = None,
|
|
1782
|
+
content_layer: Optional[ContentLayer] = None,
|
|
1688
1783
|
):
|
|
1689
1784
|
"""add_code.
|
|
1690
1785
|
|
|
@@ -1710,6 +1805,8 @@ class DoclingDocument(BaseModel):
|
|
|
1710
1805
|
)
|
|
1711
1806
|
if code_language:
|
|
1712
1807
|
code_item.code_language = code_language
|
|
1808
|
+
if content_layer:
|
|
1809
|
+
code_item.content_layer = content_layer
|
|
1713
1810
|
if prov:
|
|
1714
1811
|
code_item.prov.append(prov)
|
|
1715
1812
|
|
|
@@ -1725,6 +1822,7 @@ class DoclingDocument(BaseModel):
|
|
|
1725
1822
|
level: LevelNumber = 1,
|
|
1726
1823
|
prov: Optional[ProvenanceItem] = None,
|
|
1727
1824
|
parent: Optional[NodeItem] = None,
|
|
1825
|
+
content_layer: Optional[ContentLayer] = None,
|
|
1728
1826
|
):
|
|
1729
1827
|
"""add_heading.
|
|
1730
1828
|
|
|
@@ -1752,6 +1850,8 @@ class DoclingDocument(BaseModel):
|
|
|
1752
1850
|
)
|
|
1753
1851
|
if prov:
|
|
1754
1852
|
section_header_item.prov.append(prov)
|
|
1853
|
+
if content_layer:
|
|
1854
|
+
section_header_item.content_layer = content_layer
|
|
1755
1855
|
|
|
1756
1856
|
self.texts.append(section_header_item)
|
|
1757
1857
|
parent.children.append(RefItem(cref=cref))
|
|
@@ -1779,6 +1879,7 @@ class DoclingDocument(BaseModel):
|
|
|
1779
1879
|
with_groups: bool = False,
|
|
1780
1880
|
traverse_pictures: bool = False,
|
|
1781
1881
|
page_no: Optional[int] = None,
|
|
1882
|
+
included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
|
|
1782
1883
|
_level: int = 0, # fixed parameter, carries through the node nesting level
|
|
1783
1884
|
) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
|
|
1784
1885
|
"""iterate_elements.
|
|
@@ -1795,14 +1896,22 @@ class DoclingDocument(BaseModel):
|
|
|
1795
1896
|
root = self.body
|
|
1796
1897
|
|
|
1797
1898
|
# Yield non-group items or group items when with_groups=True
|
|
1798
|
-
|
|
1799
|
-
|
|
1800
|
-
|
|
1801
|
-
|
|
1802
|
-
|
|
1803
|
-
|
|
1804
|
-
|
|
1805
|
-
|
|
1899
|
+
|
|
1900
|
+
# Combine conditions to have a single yield point
|
|
1901
|
+
should_yield = (
|
|
1902
|
+
(not isinstance(root, GroupItem) or with_groups)
|
|
1903
|
+
and (
|
|
1904
|
+
not isinstance(root, DocItem)
|
|
1905
|
+
or (
|
|
1906
|
+
page_no is None
|
|
1907
|
+
or any(prov.page_no == page_no for prov in root.prov)
|
|
1908
|
+
)
|
|
1909
|
+
)
|
|
1910
|
+
and root.content_layer in included_content_layers
|
|
1911
|
+
)
|
|
1912
|
+
|
|
1913
|
+
if should_yield:
|
|
1914
|
+
yield root, _level
|
|
1806
1915
|
|
|
1807
1916
|
# Handle picture traversal - only traverse children if requested
|
|
1808
1917
|
if isinstance(root, PictureItem) and not traverse_pictures:
|
|
@@ -2470,17 +2579,17 @@ class DoclingDocument(BaseModel):
|
|
|
2470
2579
|
continue
|
|
2471
2580
|
|
|
2472
2581
|
elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
|
|
2582
|
+
text_inner = _prepare_tag_content(item.text)
|
|
2583
|
+
text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)
|
|
2473
2584
|
|
|
2474
|
-
text = f"<h1>{_prepare_tag_content(item.text)}</h1>"
|
|
2475
2585
|
html_texts.append(text)
|
|
2476
2586
|
|
|
2477
2587
|
elif isinstance(item, SectionHeaderItem):
|
|
2478
2588
|
|
|
2479
2589
|
section_level: int = min(item.level + 1, 6)
|
|
2480
2590
|
|
|
2481
|
-
text = (
|
|
2482
|
-
f"<h{(section_level)}>"
|
|
2483
|
-
f"{_prepare_tag_content(item.text)}</h{(section_level)}>"
|
|
2591
|
+
text = get_html_tag_with_text_direction(
|
|
2592
|
+
html_tag=f"h{section_level}", text=_prepare_tag_content(item.text)
|
|
2484
2593
|
)
|
|
2485
2594
|
html_texts.append(text)
|
|
2486
2595
|
|
|
@@ -2544,13 +2653,15 @@ class DoclingDocument(BaseModel):
|
|
|
2544
2653
|
)
|
|
2545
2654
|
|
|
2546
2655
|
elif isinstance(item, ListItem):
|
|
2547
|
-
|
|
2548
|
-
|
|
2656
|
+
text = get_html_tag_with_text_direction(
|
|
2657
|
+
html_tag="li", text=_prepare_tag_content(item.text)
|
|
2658
|
+
)
|
|
2549
2659
|
html_texts.append(text)
|
|
2550
2660
|
|
|
2551
2661
|
elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
|
|
2552
|
-
|
|
2553
|
-
|
|
2662
|
+
text = get_html_tag_with_text_direction(
|
|
2663
|
+
html_tag="li", text=_prepare_tag_content(item.text)
|
|
2664
|
+
)
|
|
2554
2665
|
html_texts.append(text)
|
|
2555
2666
|
|
|
2556
2667
|
elif isinstance(item, CodeItem):
|
|
@@ -2562,8 +2673,11 @@ class DoclingDocument(BaseModel):
|
|
|
2562
2673
|
|
|
2563
2674
|
elif isinstance(item, TextItem):
|
|
2564
2675
|
|
|
2565
|
-
text = f"<p>{_prepare_tag_content(item.text)}</p>"
|
|
2676
|
+
text = get_html_tag_with_text_direction(
|
|
2677
|
+
html_tag="p", text=_prepare_tag_content(item.text)
|
|
2678
|
+
)
|
|
2566
2679
|
html_texts.append(text)
|
|
2680
|
+
|
|
2567
2681
|
elif isinstance(item, TableItem):
|
|
2568
2682
|
|
|
2569
2683
|
text = item.export_to_html(doc=self, add_caption=True)
|
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
|
|
6
6
|
"""Utils for document types."""
|
|
7
7
|
|
|
8
|
+
import unicodedata
|
|
8
9
|
from pathlib import Path
|
|
9
10
|
|
|
10
11
|
|
|
@@ -46,3 +47,29 @@ def relative_path(src: Path, target: Path) -> Path:
|
|
|
46
47
|
|
|
47
48
|
# Combine and return the result
|
|
48
49
|
return Path(*up_segments, *down_segments)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def get_html_tag_with_text_direction(html_tag: str, text: str) -> str:
|
|
53
|
+
"""Form the HTML element with tag, text, and optional dir attribute."""
|
|
54
|
+
text_dir = get_text_direction(text)
|
|
55
|
+
|
|
56
|
+
if text_dir == "ltr":
|
|
57
|
+
return f"<{html_tag}>{text}</{html_tag}>"
|
|
58
|
+
else:
|
|
59
|
+
return f'<{html_tag} dir="{text_dir}">{text}</{html_tag}>'
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def get_text_direction(text: str) -> str:
|
|
63
|
+
"""Determine the text direction of a given string as LTR or RTL script."""
|
|
64
|
+
if not text:
|
|
65
|
+
return "ltr" # Default for empty input
|
|
66
|
+
|
|
67
|
+
rtl_scripts = {"R", "AL"}
|
|
68
|
+
rtl_chars = sum(unicodedata.bidirectional(c) in rtl_scripts for c in text)
|
|
69
|
+
|
|
70
|
+
return (
|
|
71
|
+
"rtl"
|
|
72
|
+
if unicodedata.bidirectional(text[0]) in rtl_scripts
|
|
73
|
+
or rtl_chars > len(text) / 2
|
|
74
|
+
else "ltr"
|
|
75
|
+
)
|
|
@@ -25,7 +25,7 @@ from docling_core.types.doc import (
|
|
|
25
25
|
TableItem,
|
|
26
26
|
TextItem,
|
|
27
27
|
)
|
|
28
|
-
from docling_core.types.doc.document import GroupItem, ListItem, TableData
|
|
28
|
+
from docling_core.types.doc.document import ContentLayer, GroupItem, ListItem, TableData
|
|
29
29
|
from docling_core.types.doc.labels import GroupLabel
|
|
30
30
|
from docling_core.types.legacy_doc.base import (
|
|
31
31
|
BaseCell,
|
|
@@ -400,7 +400,7 @@ def legacy_to_docling_document(legacy_doc: DsDocument) -> DoclingDocument: # no
|
|
|
400
400
|
doc.add_text(
|
|
401
401
|
label=DocItemLabel.PAGE_HEADER,
|
|
402
402
|
text=text_item.text,
|
|
403
|
-
|
|
403
|
+
content_layer=ContentLayer.FURNITURE,
|
|
404
404
|
)
|
|
405
405
|
|
|
406
406
|
# page footers
|
|
@@ -412,7 +412,7 @@ def legacy_to_docling_document(legacy_doc: DsDocument) -> DoclingDocument: # no
|
|
|
412
412
|
doc.add_text(
|
|
413
413
|
label=DocItemLabel.PAGE_FOOTER,
|
|
414
414
|
text=text_item.text,
|
|
415
|
-
|
|
415
|
+
content_layer=ContentLayer.FURNITURE,
|
|
416
416
|
)
|
|
417
417
|
|
|
418
418
|
# footnotes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.17.1 → docling_core-2.18.0}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.17.1 → docling_core-2.18.0}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.17.1 → docling_core-2.18.0}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.17.1 → docling_core-2.18.0}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|