docling-core 2.17.2__tar.gz → 2.18.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.17.2 → docling_core-2.18.0}/PKG-INFO +2 -2
- {docling_core-2.17.2 → docling_core-2.18.0}/README.md +1 -1
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/doc/document.py +105 -15
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/utils/legacy.py +3 -3
- {docling_core-2.17.2 → docling_core-2.18.0}/pyproject.toml +1 -1
- {docling_core-2.17.2 → docling_core-2.18.0}/LICENSE +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/__init__.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/cli/view.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/py.typed +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/search/package.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/base.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/utils/validators.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.17.2
|
|
3
|
+
Version: 2.18.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Home-page: https://ds4sd.github.io/
|
|
6
6
|
License: MIT
|
|
@@ -66,7 +66,7 @@ pip install docling-core
|
|
|
66
66
|
|
|
67
67
|
To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 / 3.13 and Poetry. You can then install from your local clone's root dir:
|
|
68
68
|
```bash
|
|
69
|
-
poetry install
|
|
69
|
+
poetry install --all-extras
|
|
70
70
|
```
|
|
71
71
|
|
|
72
72
|
To run the pytest suite, execute:
|
|
@@ -23,7 +23,7 @@ pip install docling-core
|
|
|
23
23
|
|
|
24
24
|
To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 / 3.13 and Poetry. You can then install from your local clone's root dir:
|
|
25
25
|
```bash
|
|
26
|
-
poetry install
|
|
26
|
+
poetry install --all-extras
|
|
27
27
|
```
|
|
28
28
|
|
|
29
29
|
To run the pytest suite, execute:
|
|
@@ -13,6 +13,7 @@ import sys
|
|
|
13
13
|
import textwrap
|
|
14
14
|
import typing
|
|
15
15
|
import warnings
|
|
16
|
+
from enum import Enum
|
|
16
17
|
from io import BytesIO
|
|
17
18
|
from pathlib import Path
|
|
18
19
|
from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
|
|
@@ -54,7 +55,7 @@ _logger = logging.getLogger(__name__)
|
|
|
54
55
|
|
|
55
56
|
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
|
|
56
57
|
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
|
|
57
|
-
CURRENT_VERSION: Final = "1.0.0"
|
|
58
|
+
CURRENT_VERSION: Final = "1.1.0"
|
|
58
59
|
|
|
59
60
|
DEFAULT_EXPORT_LABELS = {
|
|
60
61
|
DocItemLabel.TITLE,
|
|
@@ -70,6 +71,8 @@ DEFAULT_EXPORT_LABELS = {
|
|
|
70
71
|
DocItemLabel.LIST_ITEM,
|
|
71
72
|
DocItemLabel.CODE,
|
|
72
73
|
DocItemLabel.REFERENCE,
|
|
74
|
+
DocItemLabel.PAGE_HEADER,
|
|
75
|
+
DocItemLabel.PAGE_FOOTER,
|
|
73
76
|
}
|
|
74
77
|
|
|
75
78
|
|
|
@@ -513,6 +516,16 @@ class ProvenanceItem(BaseModel):
|
|
|
513
516
|
charspan: Tuple[int, int]
|
|
514
517
|
|
|
515
518
|
|
|
519
|
+
class ContentLayer(str, Enum):
|
|
520
|
+
"""ContentLayer."""
|
|
521
|
+
|
|
522
|
+
BODY = "body"
|
|
523
|
+
FURNITURE = "furniture"
|
|
524
|
+
|
|
525
|
+
|
|
526
|
+
DEFAULT_CONTENT_LAYERS = {ContentLayer.BODY}
|
|
527
|
+
|
|
528
|
+
|
|
516
529
|
class NodeItem(BaseModel):
|
|
517
530
|
"""NodeItem."""
|
|
518
531
|
|
|
@@ -520,6 +533,8 @@ class NodeItem(BaseModel):
|
|
|
520
533
|
parent: Optional[RefItem] = None
|
|
521
534
|
children: List[RefItem] = []
|
|
522
535
|
|
|
536
|
+
content_layer: ContentLayer = ContentLayer.BODY
|
|
537
|
+
|
|
523
538
|
model_config = ConfigDict(extra="forbid")
|
|
524
539
|
|
|
525
540
|
def get_ref(self):
|
|
@@ -1442,8 +1457,8 @@ class DoclingDocument(BaseModel):
|
|
|
1442
1457
|
# generated from synthetic data.
|
|
1443
1458
|
)
|
|
1444
1459
|
|
|
1445
|
-
furniture: GroupItem = GroupItem(
|
|
1446
|
-
name="_root_", self_ref="#/furniture"
|
|
1460
|
+
furniture: Annotated[GroupItem, Field(deprecated=True)] = GroupItem(
|
|
1461
|
+
name="_root_", self_ref="#/furniture", content_layer=ContentLayer.FURNITURE
|
|
1447
1462
|
) # List[RefItem] = []
|
|
1448
1463
|
body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []
|
|
1449
1464
|
|
|
@@ -1455,11 +1470,28 @@ class DoclingDocument(BaseModel):
|
|
|
1455
1470
|
|
|
1456
1471
|
pages: Dict[int, PageItem] = {} # empty as default
|
|
1457
1472
|
|
|
1473
|
+
@model_validator(mode="before")
|
|
1474
|
+
@classmethod
|
|
1475
|
+
def transform_to_content_layer(cls, data: dict) -> dict:
|
|
1476
|
+
"""transform_to_content_layer."""
|
|
1477
|
+
# Since version 1.1.0, all NodeItems carry content_layer property.
|
|
1478
|
+
# We must assign previous page_header and page_footer instances to furniture.
|
|
1479
|
+
# Note: model_validators which check on the version must use "before".
|
|
1480
|
+
if "version" in data and data["version"] == "1.0.0":
|
|
1481
|
+
for item in data.get("texts", []):
|
|
1482
|
+
if "label" in item and item["label"] in [
|
|
1483
|
+
DocItemLabel.PAGE_HEADER.value,
|
|
1484
|
+
DocItemLabel.PAGE_FOOTER.value,
|
|
1485
|
+
]:
|
|
1486
|
+
item["content_layer"] = "furniture"
|
|
1487
|
+
return data
|
|
1488
|
+
|
|
1458
1489
|
def add_group(
|
|
1459
1490
|
self,
|
|
1460
1491
|
label: Optional[GroupLabel] = None,
|
|
1461
1492
|
name: Optional[str] = None,
|
|
1462
1493
|
parent: Optional[NodeItem] = None,
|
|
1494
|
+
content_layer: Optional[ContentLayer] = None,
|
|
1463
1495
|
) -> GroupItem:
|
|
1464
1496
|
"""add_group.
|
|
1465
1497
|
|
|
@@ -1479,6 +1511,8 @@ class DoclingDocument(BaseModel):
|
|
|
1479
1511
|
group.name = name
|
|
1480
1512
|
if label is not None:
|
|
1481
1513
|
group.label = label
|
|
1514
|
+
if content_layer:
|
|
1515
|
+
group.content_layer = content_layer
|
|
1482
1516
|
|
|
1483
1517
|
self.groups.append(group)
|
|
1484
1518
|
parent.children.append(RefItem(cref=cref))
|
|
@@ -1493,6 +1527,7 @@ class DoclingDocument(BaseModel):
|
|
|
1493
1527
|
orig: Optional[str] = None,
|
|
1494
1528
|
prov: Optional[ProvenanceItem] = None,
|
|
1495
1529
|
parent: Optional[NodeItem] = None,
|
|
1530
|
+
content_layer: Optional[ContentLayer] = None,
|
|
1496
1531
|
):
|
|
1497
1532
|
"""add_list_item.
|
|
1498
1533
|
|
|
@@ -1523,6 +1558,8 @@ class DoclingDocument(BaseModel):
|
|
|
1523
1558
|
)
|
|
1524
1559
|
if prov:
|
|
1525
1560
|
list_item.prov.append(prov)
|
|
1561
|
+
if content_layer:
|
|
1562
|
+
list_item.content_layer = content_layer
|
|
1526
1563
|
|
|
1527
1564
|
self.texts.append(list_item)
|
|
1528
1565
|
parent.children.append(RefItem(cref=cref))
|
|
@@ -1536,6 +1573,7 @@ class DoclingDocument(BaseModel):
|
|
|
1536
1573
|
orig: Optional[str] = None,
|
|
1537
1574
|
prov: Optional[ProvenanceItem] = None,
|
|
1538
1575
|
parent: Optional[NodeItem] = None,
|
|
1576
|
+
content_layer: Optional[ContentLayer] = None,
|
|
1539
1577
|
):
|
|
1540
1578
|
"""add_text.
|
|
1541
1579
|
|
|
@@ -1549,16 +1587,40 @@ class DoclingDocument(BaseModel):
|
|
|
1549
1587
|
# Catch a few cases that are in principle allowed
|
|
1550
1588
|
# but that will create confusion down the road
|
|
1551
1589
|
if label in [DocItemLabel.TITLE]:
|
|
1552
|
-
return self.add_title(
|
|
1590
|
+
return self.add_title(
|
|
1591
|
+
text=text,
|
|
1592
|
+
orig=orig,
|
|
1593
|
+
prov=prov,
|
|
1594
|
+
parent=parent,
|
|
1595
|
+
content_layer=content_layer,
|
|
1596
|
+
)
|
|
1553
1597
|
|
|
1554
1598
|
elif label in [DocItemLabel.LIST_ITEM]:
|
|
1555
|
-
return self.add_list_item(
|
|
1599
|
+
return self.add_list_item(
|
|
1600
|
+
text=text,
|
|
1601
|
+
orig=orig,
|
|
1602
|
+
prov=prov,
|
|
1603
|
+
parent=parent,
|
|
1604
|
+
content_layer=content_layer,
|
|
1605
|
+
)
|
|
1556
1606
|
|
|
1557
1607
|
elif label in [DocItemLabel.SECTION_HEADER]:
|
|
1558
|
-
return self.add_heading(
|
|
1608
|
+
return self.add_heading(
|
|
1609
|
+
text=text,
|
|
1610
|
+
orig=orig,
|
|
1611
|
+
prov=prov,
|
|
1612
|
+
parent=parent,
|
|
1613
|
+
content_layer=content_layer,
|
|
1614
|
+
)
|
|
1559
1615
|
|
|
1560
1616
|
elif label in [DocItemLabel.CODE]:
|
|
1561
|
-
return self.add_code(
|
|
1617
|
+
return self.add_code(
|
|
1618
|
+
text=text,
|
|
1619
|
+
orig=orig,
|
|
1620
|
+
prov=prov,
|
|
1621
|
+
parent=parent,
|
|
1622
|
+
content_layer=content_layer,
|
|
1623
|
+
)
|
|
1562
1624
|
|
|
1563
1625
|
else:
|
|
1564
1626
|
|
|
@@ -1580,6 +1642,9 @@ class DoclingDocument(BaseModel):
|
|
|
1580
1642
|
if prov:
|
|
1581
1643
|
text_item.prov.append(prov)
|
|
1582
1644
|
|
|
1645
|
+
if content_layer:
|
|
1646
|
+
text_item.content_layer = content_layer
|
|
1647
|
+
|
|
1583
1648
|
self.texts.append(text_item)
|
|
1584
1649
|
parent.children.append(RefItem(cref=cref))
|
|
1585
1650
|
|
|
@@ -1592,6 +1657,7 @@ class DoclingDocument(BaseModel):
|
|
|
1592
1657
|
prov: Optional[ProvenanceItem] = None,
|
|
1593
1658
|
parent: Optional[NodeItem] = None,
|
|
1594
1659
|
label: DocItemLabel = DocItemLabel.TABLE,
|
|
1660
|
+
content_layer: Optional[ContentLayer] = None,
|
|
1595
1661
|
):
|
|
1596
1662
|
"""add_table.
|
|
1597
1663
|
|
|
@@ -1613,6 +1679,9 @@ class DoclingDocument(BaseModel):
|
|
|
1613
1679
|
)
|
|
1614
1680
|
if prov:
|
|
1615
1681
|
tbl_item.prov.append(prov)
|
|
1682
|
+
if content_layer:
|
|
1683
|
+
tbl_item.content_layer = content_layer
|
|
1684
|
+
|
|
1616
1685
|
if caption:
|
|
1617
1686
|
tbl_item.captions.append(caption.get_ref())
|
|
1618
1687
|
|
|
@@ -1628,6 +1697,7 @@ class DoclingDocument(BaseModel):
|
|
|
1628
1697
|
caption: Optional[Union[TextItem, RefItem]] = None,
|
|
1629
1698
|
prov: Optional[ProvenanceItem] = None,
|
|
1630
1699
|
parent: Optional[NodeItem] = None,
|
|
1700
|
+
content_layer: Optional[ContentLayer] = None,
|
|
1631
1701
|
):
|
|
1632
1702
|
"""add_picture.
|
|
1633
1703
|
|
|
@@ -1652,6 +1722,8 @@ class DoclingDocument(BaseModel):
|
|
|
1652
1722
|
)
|
|
1653
1723
|
if prov:
|
|
1654
1724
|
fig_item.prov.append(prov)
|
|
1725
|
+
if content_layer:
|
|
1726
|
+
fig_item.content_layer = content_layer
|
|
1655
1727
|
if caption:
|
|
1656
1728
|
fig_item.captions.append(caption.get_ref())
|
|
1657
1729
|
|
|
@@ -1666,6 +1738,7 @@ class DoclingDocument(BaseModel):
|
|
|
1666
1738
|
orig: Optional[str] = None,
|
|
1667
1739
|
prov: Optional[ProvenanceItem] = None,
|
|
1668
1740
|
parent: Optional[NodeItem] = None,
|
|
1741
|
+
content_layer: Optional[ContentLayer] = None,
|
|
1669
1742
|
):
|
|
1670
1743
|
"""add_title.
|
|
1671
1744
|
|
|
@@ -1691,6 +1764,8 @@ class DoclingDocument(BaseModel):
|
|
|
1691
1764
|
)
|
|
1692
1765
|
if prov:
|
|
1693
1766
|
text_item.prov.append(prov)
|
|
1767
|
+
if content_layer:
|
|
1768
|
+
text_item.content_layer = content_layer
|
|
1694
1769
|
|
|
1695
1770
|
self.texts.append(text_item)
|
|
1696
1771
|
parent.children.append(RefItem(cref=cref))
|
|
@@ -1704,6 +1779,7 @@ class DoclingDocument(BaseModel):
|
|
|
1704
1779
|
orig: Optional[str] = None,
|
|
1705
1780
|
prov: Optional[ProvenanceItem] = None,
|
|
1706
1781
|
parent: Optional[NodeItem] = None,
|
|
1782
|
+
content_layer: Optional[ContentLayer] = None,
|
|
1707
1783
|
):
|
|
1708
1784
|
"""add_code.
|
|
1709
1785
|
|
|
@@ -1729,6 +1805,8 @@ class DoclingDocument(BaseModel):
|
|
|
1729
1805
|
)
|
|
1730
1806
|
if code_language:
|
|
1731
1807
|
code_item.code_language = code_language
|
|
1808
|
+
if content_layer:
|
|
1809
|
+
code_item.content_layer = content_layer
|
|
1732
1810
|
if prov:
|
|
1733
1811
|
code_item.prov.append(prov)
|
|
1734
1812
|
|
|
@@ -1744,6 +1822,7 @@ class DoclingDocument(BaseModel):
|
|
|
1744
1822
|
level: LevelNumber = 1,
|
|
1745
1823
|
prov: Optional[ProvenanceItem] = None,
|
|
1746
1824
|
parent: Optional[NodeItem] = None,
|
|
1825
|
+
content_layer: Optional[ContentLayer] = None,
|
|
1747
1826
|
):
|
|
1748
1827
|
"""add_heading.
|
|
1749
1828
|
|
|
@@ -1771,6 +1850,8 @@ class DoclingDocument(BaseModel):
|
|
|
1771
1850
|
)
|
|
1772
1851
|
if prov:
|
|
1773
1852
|
section_header_item.prov.append(prov)
|
|
1853
|
+
if content_layer:
|
|
1854
|
+
section_header_item.content_layer = content_layer
|
|
1774
1855
|
|
|
1775
1856
|
self.texts.append(section_header_item)
|
|
1776
1857
|
parent.children.append(RefItem(cref=cref))
|
|
@@ -1798,6 +1879,7 @@ class DoclingDocument(BaseModel):
|
|
|
1798
1879
|
with_groups: bool = False,
|
|
1799
1880
|
traverse_pictures: bool = False,
|
|
1800
1881
|
page_no: Optional[int] = None,
|
|
1882
|
+
included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
|
|
1801
1883
|
_level: int = 0, # fixed parameter, carries through the node nesting level
|
|
1802
1884
|
) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
|
|
1803
1885
|
"""iterate_elements.
|
|
@@ -1814,14 +1896,22 @@ class DoclingDocument(BaseModel):
|
|
|
1814
1896
|
root = self.body
|
|
1815
1897
|
|
|
1816
1898
|
# Yield non-group items or group items when with_groups=True
|
|
1817
|
-
|
|
1818
|
-
|
|
1819
|
-
|
|
1820
|
-
|
|
1821
|
-
|
|
1822
|
-
|
|
1823
|
-
|
|
1824
|
-
|
|
1899
|
+
|
|
1900
|
+
# Combine conditions to have a single yield point
|
|
1901
|
+
should_yield = (
|
|
1902
|
+
(not isinstance(root, GroupItem) or with_groups)
|
|
1903
|
+
and (
|
|
1904
|
+
not isinstance(root, DocItem)
|
|
1905
|
+
or (
|
|
1906
|
+
page_no is None
|
|
1907
|
+
or any(prov.page_no == page_no for prov in root.prov)
|
|
1908
|
+
)
|
|
1909
|
+
)
|
|
1910
|
+
and root.content_layer in included_content_layers
|
|
1911
|
+
)
|
|
1912
|
+
|
|
1913
|
+
if should_yield:
|
|
1914
|
+
yield root, _level
|
|
1825
1915
|
|
|
1826
1916
|
# Handle picture traversal - only traverse children if requested
|
|
1827
1917
|
if isinstance(root, PictureItem) and not traverse_pictures:
|
|
@@ -25,7 +25,7 @@ from docling_core.types.doc import (
|
|
|
25
25
|
TableItem,
|
|
26
26
|
TextItem,
|
|
27
27
|
)
|
|
28
|
-
from docling_core.types.doc.document import GroupItem, ListItem, TableData
|
|
28
|
+
from docling_core.types.doc.document import ContentLayer, GroupItem, ListItem, TableData
|
|
29
29
|
from docling_core.types.doc.labels import GroupLabel
|
|
30
30
|
from docling_core.types.legacy_doc.base import (
|
|
31
31
|
BaseCell,
|
|
@@ -400,7 +400,7 @@ def legacy_to_docling_document(legacy_doc: DsDocument) -> DoclingDocument: # no
|
|
|
400
400
|
doc.add_text(
|
|
401
401
|
label=DocItemLabel.PAGE_HEADER,
|
|
402
402
|
text=text_item.text,
|
|
403
|
-
|
|
403
|
+
content_layer=ContentLayer.FURNITURE,
|
|
404
404
|
)
|
|
405
405
|
|
|
406
406
|
# page footers
|
|
@@ -412,7 +412,7 @@ def legacy_to_docling_document(legacy_doc: DsDocument) -> DoclingDocument: # no
|
|
|
412
412
|
doc.add_text(
|
|
413
413
|
label=DocItemLabel.PAGE_FOOTER,
|
|
414
414
|
text=text_item.text,
|
|
415
|
-
|
|
415
|
+
content_layer=ContentLayer.FURNITURE,
|
|
416
416
|
)
|
|
417
417
|
|
|
418
418
|
# footnotes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.17.2 → docling_core-2.18.0}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.17.2 → docling_core-2.18.0}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.17.2 → docling_core-2.18.0}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.17.2 → docling_core-2.18.0}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|