docling-core 2.17.2__tar.gz → 2.18.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (62)
  1. {docling_core-2.17.2 → docling_core-2.18.0}/PKG-INFO +2 -2
  2. {docling_core-2.17.2 → docling_core-2.18.0}/README.md +1 -1
  3. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/doc/document.py +105 -15
  4. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/utils/legacy.py +3 -3
  5. {docling_core-2.17.2 → docling_core-2.18.0}/pyproject.toml +1 -1
  6. {docling_core-2.17.2 → docling_core-2.18.0}/LICENSE +0 -0
  7. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/__init__.py +0 -0
  8. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/cli/__init__.py +0 -0
  9. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/cli/view.py +0 -0
  10. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/py.typed +0 -0
  11. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  12. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  13. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  14. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  15. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  16. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  17. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  18. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  19. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/search/__init__.py +0 -0
  20. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  21. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/search/mapping.py +0 -0
  22. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/search/meta.py +0 -0
  23. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/search/package.py +0 -0
  24. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/transforms/__init__.py +0 -0
  25. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/transforms/chunker/__init__.py +0 -0
  26. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/transforms/chunker/base.py +0 -0
  27. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  28. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  29. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/__init__.py +0 -0
  30. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/base.py +0 -0
  31. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/doc/__init__.py +0 -0
  32. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/doc/base.py +0 -0
  33. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/doc/labels.py +0 -0
  34. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/doc/tokens.py +0 -0
  35. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/doc/utils.py +0 -0
  36. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/gen/__init__.py +0 -0
  37. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/gen/generic.py +0 -0
  38. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/io/__init__.py +0 -0
  39. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  40. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/legacy_doc/base.py +0 -0
  41. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  42. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  43. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  44. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/legacy_doc/document.py +0 -0
  45. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  46. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/nlp/__init__.py +0 -0
  47. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/nlp/qa.py +0 -0
  48. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/nlp/qa_labels.py +0 -0
  49. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/rec/__init__.py +0 -0
  50. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/rec/attribute.py +0 -0
  51. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/rec/base.py +0 -0
  52. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/rec/predicate.py +0 -0
  53. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/rec/record.py +0 -0
  54. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/rec/statement.py +0 -0
  55. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/types/rec/subject.py +0 -0
  56. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/utils/__init__.py +0 -0
  57. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/utils/alias.py +0 -0
  58. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/utils/file.py +0 -0
  59. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/utils/generate_docs.py +0 -0
  60. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/utils/generate_jsonschema.py +0 -0
  61. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/utils/validate.py +0 -0
  62. {docling_core-2.17.2 → docling_core-2.18.0}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.17.2
3
+ Version: 2.18.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -66,7 +66,7 @@ pip install docling-core
66
66
 
67
67
  To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 / 3.13 and Poetry. You can then install from your local clone's root dir:
68
68
  ```bash
69
- poetry install
69
+ poetry install --all-extras
70
70
  ```
71
71
 
72
72
  To run the pytest suite, execute:
@@ -23,7 +23,7 @@ pip install docling-core
23
23
 
24
24
  To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 / 3.13 and Poetry. You can then install from your local clone's root dir:
25
25
  ```bash
26
- poetry install
26
+ poetry install --all-extras
27
27
  ```
28
28
 
29
29
  To run the pytest suite, execute:
@@ -13,6 +13,7 @@ import sys
13
13
  import textwrap
14
14
  import typing
15
15
  import warnings
16
+ from enum import Enum
16
17
  from io import BytesIO
17
18
  from pathlib import Path
18
19
  from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
@@ -54,7 +55,7 @@ _logger = logging.getLogger(__name__)
54
55
 
55
56
  Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
56
57
  LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
57
- CURRENT_VERSION: Final = "1.0.0"
58
+ CURRENT_VERSION: Final = "1.1.0"
58
59
 
59
60
  DEFAULT_EXPORT_LABELS = {
60
61
  DocItemLabel.TITLE,
@@ -70,6 +71,8 @@ DEFAULT_EXPORT_LABELS = {
70
71
  DocItemLabel.LIST_ITEM,
71
72
  DocItemLabel.CODE,
72
73
  DocItemLabel.REFERENCE,
74
+ DocItemLabel.PAGE_HEADER,
75
+ DocItemLabel.PAGE_FOOTER,
73
76
  }
74
77
 
75
78
 
@@ -513,6 +516,16 @@ class ProvenanceItem(BaseModel):
513
516
  charspan: Tuple[int, int]
514
517
 
515
518
 
519
+ class ContentLayer(str, Enum):
520
+ """ContentLayer."""
521
+
522
+ BODY = "body"
523
+ FURNITURE = "furniture"
524
+
525
+
526
+ DEFAULT_CONTENT_LAYERS = {ContentLayer.BODY}
527
+
528
+
516
529
  class NodeItem(BaseModel):
517
530
  """NodeItem."""
518
531
 
@@ -520,6 +533,8 @@ class NodeItem(BaseModel):
520
533
  parent: Optional[RefItem] = None
521
534
  children: List[RefItem] = []
522
535
 
536
+ content_layer: ContentLayer = ContentLayer.BODY
537
+
523
538
  model_config = ConfigDict(extra="forbid")
524
539
 
525
540
  def get_ref(self):
@@ -1442,8 +1457,8 @@ class DoclingDocument(BaseModel):
1442
1457
  # generated from synthetic data.
1443
1458
  )
1444
1459
 
1445
- furniture: GroupItem = GroupItem(
1446
- name="_root_", self_ref="#/furniture"
1460
+ furniture: Annotated[GroupItem, Field(deprecated=True)] = GroupItem(
1461
+ name="_root_", self_ref="#/furniture", content_layer=ContentLayer.FURNITURE
1447
1462
  ) # List[RefItem] = []
1448
1463
  body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []
1449
1464
 
@@ -1455,11 +1470,28 @@ class DoclingDocument(BaseModel):
1455
1470
 
1456
1471
  pages: Dict[int, PageItem] = {} # empty as default
1457
1472
 
1473
+ @model_validator(mode="before")
1474
+ @classmethod
1475
+ def transform_to_content_layer(cls, data: dict) -> dict:
1476
+ """transform_to_content_layer."""
1477
+ # Since version 1.1.0, all NodeItems carry content_layer property.
1478
+ # We must assign previous page_header and page_footer instances to furniture.
1479
+ # Note: model_validators which check on the version must use "before".
1480
+ if "version" in data and data["version"] == "1.0.0":
1481
+ for item in data.get("texts", []):
1482
+ if "label" in item and item["label"] in [
1483
+ DocItemLabel.PAGE_HEADER.value,
1484
+ DocItemLabel.PAGE_FOOTER.value,
1485
+ ]:
1486
+ item["content_layer"] = "furniture"
1487
+ return data
1488
+
1458
1489
  def add_group(
1459
1490
  self,
1460
1491
  label: Optional[GroupLabel] = None,
1461
1492
  name: Optional[str] = None,
1462
1493
  parent: Optional[NodeItem] = None,
1494
+ content_layer: Optional[ContentLayer] = None,
1463
1495
  ) -> GroupItem:
1464
1496
  """add_group.
1465
1497
 
@@ -1479,6 +1511,8 @@ class DoclingDocument(BaseModel):
1479
1511
  group.name = name
1480
1512
  if label is not None:
1481
1513
  group.label = label
1514
+ if content_layer:
1515
+ group.content_layer = content_layer
1482
1516
 
1483
1517
  self.groups.append(group)
1484
1518
  parent.children.append(RefItem(cref=cref))
@@ -1493,6 +1527,7 @@ class DoclingDocument(BaseModel):
1493
1527
  orig: Optional[str] = None,
1494
1528
  prov: Optional[ProvenanceItem] = None,
1495
1529
  parent: Optional[NodeItem] = None,
1530
+ content_layer: Optional[ContentLayer] = None,
1496
1531
  ):
1497
1532
  """add_list_item.
1498
1533
 
@@ -1523,6 +1558,8 @@ class DoclingDocument(BaseModel):
1523
1558
  )
1524
1559
  if prov:
1525
1560
  list_item.prov.append(prov)
1561
+ if content_layer:
1562
+ list_item.content_layer = content_layer
1526
1563
 
1527
1564
  self.texts.append(list_item)
1528
1565
  parent.children.append(RefItem(cref=cref))
@@ -1536,6 +1573,7 @@ class DoclingDocument(BaseModel):
1536
1573
  orig: Optional[str] = None,
1537
1574
  prov: Optional[ProvenanceItem] = None,
1538
1575
  parent: Optional[NodeItem] = None,
1576
+ content_layer: Optional[ContentLayer] = None,
1539
1577
  ):
1540
1578
  """add_text.
1541
1579
 
@@ -1549,16 +1587,40 @@ class DoclingDocument(BaseModel):
1549
1587
  # Catch a few cases that are in principle allowed
1550
1588
  # but that will create confusion down the road
1551
1589
  if label in [DocItemLabel.TITLE]:
1552
- return self.add_title(text=text, orig=orig, prov=prov, parent=parent)
1590
+ return self.add_title(
1591
+ text=text,
1592
+ orig=orig,
1593
+ prov=prov,
1594
+ parent=parent,
1595
+ content_layer=content_layer,
1596
+ )
1553
1597
 
1554
1598
  elif label in [DocItemLabel.LIST_ITEM]:
1555
- return self.add_list_item(text=text, orig=orig, prov=prov, parent=parent)
1599
+ return self.add_list_item(
1600
+ text=text,
1601
+ orig=orig,
1602
+ prov=prov,
1603
+ parent=parent,
1604
+ content_layer=content_layer,
1605
+ )
1556
1606
 
1557
1607
  elif label in [DocItemLabel.SECTION_HEADER]:
1558
- return self.add_heading(text=text, orig=orig, prov=prov, parent=parent)
1608
+ return self.add_heading(
1609
+ text=text,
1610
+ orig=orig,
1611
+ prov=prov,
1612
+ parent=parent,
1613
+ content_layer=content_layer,
1614
+ )
1559
1615
 
1560
1616
  elif label in [DocItemLabel.CODE]:
1561
- return self.add_code(text=text, orig=orig, prov=prov, parent=parent)
1617
+ return self.add_code(
1618
+ text=text,
1619
+ orig=orig,
1620
+ prov=prov,
1621
+ parent=parent,
1622
+ content_layer=content_layer,
1623
+ )
1562
1624
 
1563
1625
  else:
1564
1626
 
@@ -1580,6 +1642,9 @@ class DoclingDocument(BaseModel):
1580
1642
  if prov:
1581
1643
  text_item.prov.append(prov)
1582
1644
 
1645
+ if content_layer:
1646
+ text_item.content_layer = content_layer
1647
+
1583
1648
  self.texts.append(text_item)
1584
1649
  parent.children.append(RefItem(cref=cref))
1585
1650
 
@@ -1592,6 +1657,7 @@ class DoclingDocument(BaseModel):
1592
1657
  prov: Optional[ProvenanceItem] = None,
1593
1658
  parent: Optional[NodeItem] = None,
1594
1659
  label: DocItemLabel = DocItemLabel.TABLE,
1660
+ content_layer: Optional[ContentLayer] = None,
1595
1661
  ):
1596
1662
  """add_table.
1597
1663
 
@@ -1613,6 +1679,9 @@ class DoclingDocument(BaseModel):
1613
1679
  )
1614
1680
  if prov:
1615
1681
  tbl_item.prov.append(prov)
1682
+ if content_layer:
1683
+ tbl_item.content_layer = content_layer
1684
+
1616
1685
  if caption:
1617
1686
  tbl_item.captions.append(caption.get_ref())
1618
1687
 
@@ -1628,6 +1697,7 @@ class DoclingDocument(BaseModel):
1628
1697
  caption: Optional[Union[TextItem, RefItem]] = None,
1629
1698
  prov: Optional[ProvenanceItem] = None,
1630
1699
  parent: Optional[NodeItem] = None,
1700
+ content_layer: Optional[ContentLayer] = None,
1631
1701
  ):
1632
1702
  """add_picture.
1633
1703
 
@@ -1652,6 +1722,8 @@ class DoclingDocument(BaseModel):
1652
1722
  )
1653
1723
  if prov:
1654
1724
  fig_item.prov.append(prov)
1725
+ if content_layer:
1726
+ fig_item.content_layer = content_layer
1655
1727
  if caption:
1656
1728
  fig_item.captions.append(caption.get_ref())
1657
1729
 
@@ -1666,6 +1738,7 @@ class DoclingDocument(BaseModel):
1666
1738
  orig: Optional[str] = None,
1667
1739
  prov: Optional[ProvenanceItem] = None,
1668
1740
  parent: Optional[NodeItem] = None,
1741
+ content_layer: Optional[ContentLayer] = None,
1669
1742
  ):
1670
1743
  """add_title.
1671
1744
 
@@ -1691,6 +1764,8 @@ class DoclingDocument(BaseModel):
1691
1764
  )
1692
1765
  if prov:
1693
1766
  text_item.prov.append(prov)
1767
+ if content_layer:
1768
+ text_item.content_layer = content_layer
1694
1769
 
1695
1770
  self.texts.append(text_item)
1696
1771
  parent.children.append(RefItem(cref=cref))
@@ -1704,6 +1779,7 @@ class DoclingDocument(BaseModel):
1704
1779
  orig: Optional[str] = None,
1705
1780
  prov: Optional[ProvenanceItem] = None,
1706
1781
  parent: Optional[NodeItem] = None,
1782
+ content_layer: Optional[ContentLayer] = None,
1707
1783
  ):
1708
1784
  """add_code.
1709
1785
 
@@ -1729,6 +1805,8 @@ class DoclingDocument(BaseModel):
1729
1805
  )
1730
1806
  if code_language:
1731
1807
  code_item.code_language = code_language
1808
+ if content_layer:
1809
+ code_item.content_layer = content_layer
1732
1810
  if prov:
1733
1811
  code_item.prov.append(prov)
1734
1812
 
@@ -1744,6 +1822,7 @@ class DoclingDocument(BaseModel):
1744
1822
  level: LevelNumber = 1,
1745
1823
  prov: Optional[ProvenanceItem] = None,
1746
1824
  parent: Optional[NodeItem] = None,
1825
+ content_layer: Optional[ContentLayer] = None,
1747
1826
  ):
1748
1827
  """add_heading.
1749
1828
 
@@ -1771,6 +1850,8 @@ class DoclingDocument(BaseModel):
1771
1850
  )
1772
1851
  if prov:
1773
1852
  section_header_item.prov.append(prov)
1853
+ if content_layer:
1854
+ section_header_item.content_layer = content_layer
1774
1855
 
1775
1856
  self.texts.append(section_header_item)
1776
1857
  parent.children.append(RefItem(cref=cref))
@@ -1798,6 +1879,7 @@ class DoclingDocument(BaseModel):
1798
1879
  with_groups: bool = False,
1799
1880
  traverse_pictures: bool = False,
1800
1881
  page_no: Optional[int] = None,
1882
+ included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
1801
1883
  _level: int = 0, # fixed parameter, carries through the node nesting level
1802
1884
  ) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
1803
1885
  """iterate_elements.
@@ -1814,14 +1896,22 @@ class DoclingDocument(BaseModel):
1814
1896
  root = self.body
1815
1897
 
1816
1898
  # Yield non-group items or group items when with_groups=True
1817
- if not isinstance(root, GroupItem) or with_groups:
1818
- if isinstance(root, DocItem):
1819
- if page_no is None or any(
1820
- prov.page_no == page_no for prov in root.prov
1821
- ):
1822
- yield root, _level
1823
- else:
1824
- yield root, _level
1899
+
1900
+ # Combine conditions to have a single yield point
1901
+ should_yield = (
1902
+ (not isinstance(root, GroupItem) or with_groups)
1903
+ and (
1904
+ not isinstance(root, DocItem)
1905
+ or (
1906
+ page_no is None
1907
+ or any(prov.page_no == page_no for prov in root.prov)
1908
+ )
1909
+ )
1910
+ and root.content_layer in included_content_layers
1911
+ )
1912
+
1913
+ if should_yield:
1914
+ yield root, _level
1825
1915
 
1826
1916
  # Handle picture traversal - only traverse children if requested
1827
1917
  if isinstance(root, PictureItem) and not traverse_pictures:
@@ -25,7 +25,7 @@ from docling_core.types.doc import (
25
25
  TableItem,
26
26
  TextItem,
27
27
  )
28
- from docling_core.types.doc.document import GroupItem, ListItem, TableData
28
+ from docling_core.types.doc.document import ContentLayer, GroupItem, ListItem, TableData
29
29
  from docling_core.types.doc.labels import GroupLabel
30
30
  from docling_core.types.legacy_doc.base import (
31
31
  BaseCell,
@@ -400,7 +400,7 @@ def legacy_to_docling_document(legacy_doc: DsDocument) -> DoclingDocument: # no
400
400
  doc.add_text(
401
401
  label=DocItemLabel.PAGE_HEADER,
402
402
  text=text_item.text,
403
- parent=doc.furniture,
403
+ content_layer=ContentLayer.FURNITURE,
404
404
  )
405
405
 
406
406
  # page footers
@@ -412,7 +412,7 @@ def legacy_to_docling_document(legacy_doc: DsDocument) -> DoclingDocument: # no
412
412
  doc.add_text(
413
413
  label=DocItemLabel.PAGE_FOOTER,
414
414
  text=text_item.text,
415
- parent=doc.furniture,
415
+ content_layer=ContentLayer.FURNITURE,
416
416
  )
417
417
 
418
418
  # footnotes
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "2.17.2"
3
+ version = "2.18.0"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
File without changes