docling-core 2.48.4__py3-none-any.whl → 2.50.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/cli/view.py +21 -5
- docling_core/transforms/serializer/base.py +31 -0
- docling_core/transforms/serializer/common.py +180 -100
- docling_core/transforms/serializer/doctags.py +35 -20
- docling_core/transforms/serializer/html.py +78 -3
- docling_core/transforms/serializer/markdown.py +114 -5
- docling_core/types/doc/__init__.py +11 -0
- docling_core/types/doc/document.py +359 -8
- docling_core/types/doc/tokens.py +6 -0
- {docling_core-2.48.4.dist-info → docling_core-2.50.0.dist-info}/METADATA +9 -4
- {docling_core-2.48.4.dist-info → docling_core-2.50.0.dist-info}/RECORD +15 -15
- {docling_core-2.48.4.dist-info → docling_core-2.50.0.dist-info}/WHEEL +0 -0
- {docling_core-2.48.4.dist-info → docling_core-2.50.0.dist-info}/entry_points.txt +0 -0
- {docling_core-2.48.4.dist-info → docling_core-2.50.0.dist-info}/licenses/LICENSE +0 -0
- {docling_core-2.48.4.dist-info → docling_core-2.50.0.dist-info}/top_level.txt +0 -0
|
@@ -27,6 +27,8 @@ from pydantic import (
|
|
|
27
27
|
Field,
|
|
28
28
|
FieldSerializationInfo,
|
|
29
29
|
StringConstraints,
|
|
30
|
+
TypeAdapter,
|
|
31
|
+
ValidationError,
|
|
30
32
|
computed_field,
|
|
31
33
|
field_serializer,
|
|
32
34
|
field_validator,
|
|
@@ -60,7 +62,7 @@ _logger = logging.getLogger(__name__)
|
|
|
60
62
|
|
|
61
63
|
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
|
|
62
64
|
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
|
|
63
|
-
CURRENT_VERSION: Final = "1.
|
|
65
|
+
CURRENT_VERSION: Final = "1.8.0"
|
|
64
66
|
|
|
65
67
|
DEFAULT_EXPORT_LABELS = {
|
|
66
68
|
DocItemLabel.TITLE,
|
|
@@ -941,6 +943,156 @@ class ContentLayer(str, Enum):
|
|
|
941
943
|
DEFAULT_CONTENT_LAYERS = {ContentLayer.BODY}
|
|
942
944
|
|
|
943
945
|
|
|
946
|
+
class _ExtraAllowingModel(BaseModel):
|
|
947
|
+
"""Base model allowing extra fields."""
|
|
948
|
+
|
|
949
|
+
model_config = ConfigDict(extra="allow")
|
|
950
|
+
|
|
951
|
+
def get_custom_part(self) -> dict[str, Any]:
|
|
952
|
+
"""Get the extra fields as a dictionary."""
|
|
953
|
+
return self.__pydantic_extra__ or {}
|
|
954
|
+
|
|
955
|
+
def _copy_without_extra(self) -> Self:
|
|
956
|
+
"""Create a copy without the extra fields."""
|
|
957
|
+
return self.model_validate(
|
|
958
|
+
self.model_dump(exclude={ex for ex in self.get_custom_part()})
|
|
959
|
+
)
|
|
960
|
+
|
|
961
|
+
def _check_custom_field_format(self, key: str) -> None:
|
|
962
|
+
parts = key.split(MetaUtils._META_FIELD_NAMESPACE_DELIMITER, maxsplit=1)
|
|
963
|
+
if len(parts) != 2 or (not parts[0]) or (not parts[1]):
|
|
964
|
+
raise ValueError(
|
|
965
|
+
f"Custom meta field name must be in format 'namespace__field_name' (e.g. 'my_corp__max_size'): {key}"
|
|
966
|
+
)
|
|
967
|
+
|
|
968
|
+
@model_validator(mode="after")
|
|
969
|
+
def _validate_field_names(self) -> Self:
|
|
970
|
+
extra_dict = self.get_custom_part()
|
|
971
|
+
for key in self.model_dump():
|
|
972
|
+
if key in extra_dict:
|
|
973
|
+
self._check_custom_field_format(key=key)
|
|
974
|
+
elif MetaUtils._META_FIELD_NAMESPACE_DELIMITER in key:
|
|
975
|
+
raise ValueError(
|
|
976
|
+
f"Standard meta field name must not contain '__': {key}"
|
|
977
|
+
)
|
|
978
|
+
|
|
979
|
+
return self
|
|
980
|
+
|
|
981
|
+
def __setattr__(self, name: str, value: Any) -> None:
|
|
982
|
+
super().__setattr__(name, value)
|
|
983
|
+
if name in self.get_custom_part():
|
|
984
|
+
self._check_custom_field_format(key=name)
|
|
985
|
+
|
|
986
|
+
def set_custom_field(self, namespace: str, name: str, value: Any) -> str:
|
|
987
|
+
"""Set a custom field and return the key."""
|
|
988
|
+
key = MetaUtils.create_meta_field_name(namespace=namespace, name=name)
|
|
989
|
+
setattr(self, key, value)
|
|
990
|
+
return key
|
|
991
|
+
|
|
992
|
+
|
|
993
|
+
class BasePrediction(_ExtraAllowingModel):
|
|
994
|
+
"""Prediction field."""
|
|
995
|
+
|
|
996
|
+
confidence: Optional[float] = Field(
|
|
997
|
+
default=None,
|
|
998
|
+
ge=0,
|
|
999
|
+
le=1,
|
|
1000
|
+
description="The confidence of the prediction.",
|
|
1001
|
+
examples=[0.9, 0.42],
|
|
1002
|
+
)
|
|
1003
|
+
created_by: Optional[str] = Field(
|
|
1004
|
+
default=None,
|
|
1005
|
+
description="The origin of the prediction.",
|
|
1006
|
+
examples=["ibm-granite/granite-docling-258M"],
|
|
1007
|
+
)
|
|
1008
|
+
|
|
1009
|
+
@field_serializer("confidence")
|
|
1010
|
+
def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
|
|
1011
|
+
return round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)
|
|
1012
|
+
|
|
1013
|
+
|
|
1014
|
+
class SummaryMetaField(BasePrediction):
|
|
1015
|
+
"""Summary data."""
|
|
1016
|
+
|
|
1017
|
+
text: str
|
|
1018
|
+
|
|
1019
|
+
|
|
1020
|
+
# NOTE: must be manually kept in sync with top-level BaseMeta hierarchy fields
|
|
1021
|
+
class MetaFieldName(str, Enum):
|
|
1022
|
+
"""Standard meta field names."""
|
|
1023
|
+
|
|
1024
|
+
SUMMARY = "summary" # a summary of the tree under this node
|
|
1025
|
+
DESCRIPTION = "description" # a description of the node (e.g. for images)
|
|
1026
|
+
CLASSIFICATION = "classification" # a classification of the node content
|
|
1027
|
+
MOLECULE = "molecule" # molecule data
|
|
1028
|
+
TABULAR_CHART = "tabular_chart" # tabular chart data
|
|
1029
|
+
|
|
1030
|
+
|
|
1031
|
+
class BaseMeta(_ExtraAllowingModel):
|
|
1032
|
+
"""Base class for metadata."""
|
|
1033
|
+
|
|
1034
|
+
summary: Optional[SummaryMetaField] = None
|
|
1035
|
+
|
|
1036
|
+
|
|
1037
|
+
class DescriptionMetaField(BasePrediction):
|
|
1038
|
+
"""Description metadata field."""
|
|
1039
|
+
|
|
1040
|
+
text: str
|
|
1041
|
+
|
|
1042
|
+
|
|
1043
|
+
class PictureClassificationPrediction(BasePrediction):
|
|
1044
|
+
"""Picture classification instance."""
|
|
1045
|
+
|
|
1046
|
+
class_name: str
|
|
1047
|
+
|
|
1048
|
+
|
|
1049
|
+
class PictureClassificationMetaField(_ExtraAllowingModel):
|
|
1050
|
+
"""Picture classification metadata field."""
|
|
1051
|
+
|
|
1052
|
+
predictions: list[PictureClassificationPrediction] = Field(
|
|
1053
|
+
default_factory=list, min_length=1
|
|
1054
|
+
)
|
|
1055
|
+
|
|
1056
|
+
def get_main_prediction(self) -> PictureClassificationPrediction:
|
|
1057
|
+
"""Get prediction with highest confidence (if confidence not available, first is used by convention)."""
|
|
1058
|
+
max_conf_pos: Optional[int] = None
|
|
1059
|
+
max_conf: Optional[float] = None
|
|
1060
|
+
for i, pred in enumerate(self.predictions):
|
|
1061
|
+
if pred.confidence is not None and (
|
|
1062
|
+
max_conf is None or pred.confidence > max_conf
|
|
1063
|
+
):
|
|
1064
|
+
max_conf_pos = i
|
|
1065
|
+
max_conf = pred.confidence
|
|
1066
|
+
return self.predictions[max_conf_pos if max_conf_pos is not None else 0]
|
|
1067
|
+
|
|
1068
|
+
|
|
1069
|
+
class MoleculeMetaField(BasePrediction):
|
|
1070
|
+
"""Molecule metadata field."""
|
|
1071
|
+
|
|
1072
|
+
smi: str = Field(description="The SMILES representation of the molecule.")
|
|
1073
|
+
|
|
1074
|
+
|
|
1075
|
+
class TabularChartMetaField(BasePrediction):
|
|
1076
|
+
"""Tabular chart metadata field."""
|
|
1077
|
+
|
|
1078
|
+
title: Optional[str] = None
|
|
1079
|
+
chart_data: TableData
|
|
1080
|
+
|
|
1081
|
+
|
|
1082
|
+
class FloatingMeta(BaseMeta):
|
|
1083
|
+
"""Metadata model for floating."""
|
|
1084
|
+
|
|
1085
|
+
description: Optional[DescriptionMetaField] = None
|
|
1086
|
+
|
|
1087
|
+
|
|
1088
|
+
class PictureMeta(FloatingMeta):
|
|
1089
|
+
"""Metadata model for pictures."""
|
|
1090
|
+
|
|
1091
|
+
classification: Optional[PictureClassificationMetaField] = None
|
|
1092
|
+
molecule: Optional[MoleculeMetaField] = None
|
|
1093
|
+
tabular_chart: Optional[TabularChartMetaField] = None
|
|
1094
|
+
|
|
1095
|
+
|
|
944
1096
|
class NodeItem(BaseModel):
|
|
945
1097
|
"""NodeItem."""
|
|
946
1098
|
|
|
@@ -952,6 +1104,8 @@ class NodeItem(BaseModel):
|
|
|
952
1104
|
|
|
953
1105
|
model_config = ConfigDict(extra="forbid")
|
|
954
1106
|
|
|
1107
|
+
meta: Optional[BaseMeta] = None
|
|
1108
|
+
|
|
955
1109
|
def get_ref(self) -> RefItem:
|
|
956
1110
|
"""get_ref."""
|
|
957
1111
|
return RefItem(cref=self.self_ref)
|
|
@@ -1312,6 +1466,8 @@ class ListItem(TextItem):
|
|
|
1312
1466
|
class FloatingItem(DocItem):
|
|
1313
1467
|
"""FloatingItem."""
|
|
1314
1468
|
|
|
1469
|
+
meta: Optional[FloatingMeta] = None
|
|
1470
|
+
|
|
1315
1471
|
captions: List[RefItem] = []
|
|
1316
1472
|
references: List[RefItem] = []
|
|
1317
1473
|
footnotes: List[RefItem] = []
|
|
@@ -1399,6 +1555,33 @@ class FormulaItem(TextItem):
|
|
|
1399
1555
|
)
|
|
1400
1556
|
|
|
1401
1557
|
|
|
1558
|
+
class MetaUtils:
|
|
1559
|
+
"""Metadata-related utilities."""
|
|
1560
|
+
|
|
1561
|
+
_META_FIELD_NAMESPACE_DELIMITER: Final = "__"
|
|
1562
|
+
_META_FIELD_LEGACY_NAMESPACE: Final = "docling_legacy"
|
|
1563
|
+
|
|
1564
|
+
@classmethod
|
|
1565
|
+
def create_meta_field_name(
|
|
1566
|
+
cls,
|
|
1567
|
+
*,
|
|
1568
|
+
namespace: str,
|
|
1569
|
+
name: str,
|
|
1570
|
+
) -> str:
|
|
1571
|
+
"""Create a meta field name."""
|
|
1572
|
+
return f"{namespace}{cls._META_FIELD_NAMESPACE_DELIMITER}{name}"
|
|
1573
|
+
|
|
1574
|
+
@classmethod
|
|
1575
|
+
def _create_migrated_meta_field_name(
|
|
1576
|
+
cls,
|
|
1577
|
+
*,
|
|
1578
|
+
name: str,
|
|
1579
|
+
) -> str:
|
|
1580
|
+
return cls.create_meta_field_name(
|
|
1581
|
+
namespace=cls._META_FIELD_LEGACY_NAMESPACE, name=name
|
|
1582
|
+
)
|
|
1583
|
+
|
|
1584
|
+
|
|
1402
1585
|
class PictureItem(FloatingItem):
|
|
1403
1586
|
"""PictureItem."""
|
|
1404
1587
|
|
|
@@ -1406,7 +1589,94 @@ class PictureItem(FloatingItem):
|
|
|
1406
1589
|
DocItemLabel.PICTURE
|
|
1407
1590
|
)
|
|
1408
1591
|
|
|
1409
|
-
|
|
1592
|
+
meta: Optional[PictureMeta] = None
|
|
1593
|
+
annotations: Annotated[
|
|
1594
|
+
List[PictureDataType],
|
|
1595
|
+
deprecated("Field `annotations` is deprecated; use `meta` instead."),
|
|
1596
|
+
] = []
|
|
1597
|
+
|
|
1598
|
+
@model_validator(mode="before")
|
|
1599
|
+
@classmethod
|
|
1600
|
+
def _migrate_annotations_to_meta(cls, data: Any) -> Any:
|
|
1601
|
+
"""Migrate the `annotations` field to `meta`."""
|
|
1602
|
+
if isinstance(data, dict) and (annotations := data.get("annotations")):
|
|
1603
|
+
_logger.warning(
|
|
1604
|
+
"Migrating deprecated `annotations` to `meta`; this will be removed in the future. "
|
|
1605
|
+
"Note that only the first available instance of each annotation type will be migrated."
|
|
1606
|
+
)
|
|
1607
|
+
for raw_ann in annotations:
|
|
1608
|
+
# migrate annotations to meta
|
|
1609
|
+
|
|
1610
|
+
try:
|
|
1611
|
+
ann: PictureDataType = TypeAdapter(PictureDataType).validate_python(
|
|
1612
|
+
raw_ann
|
|
1613
|
+
)
|
|
1614
|
+
except ValidationError as e:
|
|
1615
|
+
raise e
|
|
1616
|
+
|
|
1617
|
+
# ensure meta field is present
|
|
1618
|
+
data.setdefault("meta", {})
|
|
1619
|
+
|
|
1620
|
+
if isinstance(ann, PictureClassificationData):
|
|
1621
|
+
data["meta"].setdefault(
|
|
1622
|
+
MetaFieldName.CLASSIFICATION.value,
|
|
1623
|
+
PictureClassificationMetaField(
|
|
1624
|
+
predictions=[
|
|
1625
|
+
PictureClassificationPrediction(
|
|
1626
|
+
class_name=pred.class_name,
|
|
1627
|
+
confidence=pred.confidence,
|
|
1628
|
+
created_by=ann.provenance,
|
|
1629
|
+
)
|
|
1630
|
+
for pred in ann.predicted_classes
|
|
1631
|
+
],
|
|
1632
|
+
).model_dump(mode="json"),
|
|
1633
|
+
)
|
|
1634
|
+
elif isinstance(ann, DescriptionAnnotation):
|
|
1635
|
+
data["meta"].setdefault(
|
|
1636
|
+
MetaFieldName.DESCRIPTION.value,
|
|
1637
|
+
DescriptionMetaField(
|
|
1638
|
+
text=ann.text,
|
|
1639
|
+
created_by=ann.provenance,
|
|
1640
|
+
).model_dump(mode="json"),
|
|
1641
|
+
)
|
|
1642
|
+
elif isinstance(ann, PictureMoleculeData):
|
|
1643
|
+
data["meta"].setdefault(
|
|
1644
|
+
MetaFieldName.MOLECULE.value,
|
|
1645
|
+
MoleculeMetaField(
|
|
1646
|
+
smi=ann.smi,
|
|
1647
|
+
confidence=ann.confidence,
|
|
1648
|
+
created_by=ann.provenance,
|
|
1649
|
+
**{
|
|
1650
|
+
MetaUtils._create_migrated_meta_field_name(
|
|
1651
|
+
name="segmentation"
|
|
1652
|
+
): ann.segmentation,
|
|
1653
|
+
MetaUtils._create_migrated_meta_field_name(
|
|
1654
|
+
name="class_name"
|
|
1655
|
+
): ann.class_name,
|
|
1656
|
+
},
|
|
1657
|
+
).model_dump(mode="json"),
|
|
1658
|
+
)
|
|
1659
|
+
elif isinstance(ann, PictureTabularChartData):
|
|
1660
|
+
data["meta"].setdefault(
|
|
1661
|
+
MetaFieldName.TABULAR_CHART.value,
|
|
1662
|
+
TabularChartMetaField(
|
|
1663
|
+
title=ann.title,
|
|
1664
|
+
chart_data=ann.chart_data,
|
|
1665
|
+
).model_dump(mode="json"),
|
|
1666
|
+
)
|
|
1667
|
+
elif isinstance(ann, MiscAnnotation):
|
|
1668
|
+
data["meta"].setdefault(
|
|
1669
|
+
MetaUtils._create_migrated_meta_field_name(name=ann.kind),
|
|
1670
|
+
ann.content,
|
|
1671
|
+
)
|
|
1672
|
+
else:
|
|
1673
|
+
# fall back to reusing original annotation type name (in namespaced format)
|
|
1674
|
+
data["meta"].setdefault(
|
|
1675
|
+
MetaUtils._create_migrated_meta_field_name(name=ann.kind),
|
|
1676
|
+
ann.model_dump(mode="json"),
|
|
1677
|
+
)
|
|
1678
|
+
|
|
1679
|
+
return data
|
|
1410
1680
|
|
|
1411
1681
|
# Convert the image to Base64
|
|
1412
1682
|
def _image_to_base64(self, pil_image, format="PNG"):
|
|
@@ -1554,7 +1824,54 @@ class TableItem(FloatingItem):
|
|
|
1554
1824
|
DocItemLabel.TABLE,
|
|
1555
1825
|
] = DocItemLabel.TABLE
|
|
1556
1826
|
|
|
1557
|
-
annotations:
|
|
1827
|
+
annotations: Annotated[
|
|
1828
|
+
List[TableAnnotationType],
|
|
1829
|
+
deprecated("Field `annotations` is deprecated; use `meta` instead."),
|
|
1830
|
+
] = []
|
|
1831
|
+
|
|
1832
|
+
@model_validator(mode="before")
|
|
1833
|
+
@classmethod
|
|
1834
|
+
def migrate_annotations_to_meta(cls, data: Any) -> Any:
|
|
1835
|
+
"""Migrate the `annotations` field to `meta`."""
|
|
1836
|
+
if isinstance(data, dict) and (annotations := data.get("annotations")):
|
|
1837
|
+
_logger.warning(
|
|
1838
|
+
"Migrating deprecated `annotations` to `meta`; this will be removed in the future. "
|
|
1839
|
+
"Note that only the first available instance of each annotation type will be migrated."
|
|
1840
|
+
)
|
|
1841
|
+
for raw_ann in annotations:
|
|
1842
|
+
# migrate annotations to meta
|
|
1843
|
+
|
|
1844
|
+
try:
|
|
1845
|
+
ann: TableAnnotationType = TypeAdapter(
|
|
1846
|
+
TableAnnotationType
|
|
1847
|
+
).validate_python(raw_ann)
|
|
1848
|
+
except ValidationError as e:
|
|
1849
|
+
raise e
|
|
1850
|
+
|
|
1851
|
+
# ensure meta field is present
|
|
1852
|
+
data.setdefault("meta", {})
|
|
1853
|
+
|
|
1854
|
+
if isinstance(ann, DescriptionAnnotation):
|
|
1855
|
+
data["meta"].setdefault(
|
|
1856
|
+
MetaFieldName.DESCRIPTION.value,
|
|
1857
|
+
DescriptionMetaField(
|
|
1858
|
+
text=ann.text,
|
|
1859
|
+
created_by=ann.provenance,
|
|
1860
|
+
).model_dump(mode="json"),
|
|
1861
|
+
)
|
|
1862
|
+
elif isinstance(ann, MiscAnnotation):
|
|
1863
|
+
data["meta"].setdefault(
|
|
1864
|
+
MetaUtils._create_migrated_meta_field_name(name=ann.kind),
|
|
1865
|
+
ann.content,
|
|
1866
|
+
)
|
|
1867
|
+
else:
|
|
1868
|
+
# fall back to reusing original annotation type name (in namespaced format)
|
|
1869
|
+
data["meta"].setdefault(
|
|
1870
|
+
MetaUtils._create_migrated_meta_field_name(name=ann.kind),
|
|
1871
|
+
ann.model_dump(mode="json"),
|
|
1872
|
+
)
|
|
1873
|
+
|
|
1874
|
+
return data
|
|
1558
1875
|
|
|
1559
1876
|
def export_to_dataframe(
|
|
1560
1877
|
self, doc: Optional["DoclingDocument"] = None
|
|
@@ -2267,7 +2584,7 @@ class DoclingDocument(BaseModel):
|
|
|
2267
2584
|
if not success:
|
|
2268
2585
|
del to_be_deleted_items[stack_]
|
|
2269
2586
|
else:
|
|
2270
|
-
_logger.
|
|
2587
|
+
_logger.debug(f"deleted item in tree at stack: {stack_} => {ref_}")
|
|
2271
2588
|
|
|
2272
2589
|
# Create a new lookup of the orphans:
|
|
2273
2590
|
# dict of item_label (`texts`, `tables`, ...) to a
|
|
@@ -4396,6 +4713,9 @@ class DoclingDocument(BaseModel):
|
|
|
4396
4713
|
included_content_layers: Optional[set[ContentLayer]] = None,
|
|
4397
4714
|
page_break_placeholder: Optional[str] = None,
|
|
4398
4715
|
include_annotations: bool = True,
|
|
4716
|
+
*,
|
|
4717
|
+
mark_meta: bool = False,
|
|
4718
|
+
use_legacy_annotations: bool = False,
|
|
4399
4719
|
):
|
|
4400
4720
|
"""Save to markdown."""
|
|
4401
4721
|
if isinstance(filename, str):
|
|
@@ -4425,6 +4745,8 @@ class DoclingDocument(BaseModel):
|
|
|
4425
4745
|
included_content_layers=included_content_layers,
|
|
4426
4746
|
page_break_placeholder=page_break_placeholder,
|
|
4427
4747
|
include_annotations=include_annotations,
|
|
4748
|
+
use_legacy_annotations=use_legacy_annotations,
|
|
4749
|
+
mark_meta=mark_meta,
|
|
4428
4750
|
)
|
|
4429
4751
|
|
|
4430
4752
|
with open(filename, "w", encoding="utf-8") as fw:
|
|
@@ -4449,6 +4771,11 @@ class DoclingDocument(BaseModel):
|
|
|
4449
4771
|
page_break_placeholder: Optional[str] = None, # e.g. "<!-- page break -->",
|
|
4450
4772
|
include_annotations: bool = True,
|
|
4451
4773
|
mark_annotations: bool = False,
|
|
4774
|
+
*,
|
|
4775
|
+
use_legacy_annotations: bool = False,
|
|
4776
|
+
allowed_meta_names: Optional[set[str]] = None,
|
|
4777
|
+
blocked_meta_names: Optional[set[str]] = None,
|
|
4778
|
+
mark_meta: bool = False,
|
|
4452
4779
|
) -> str:
|
|
4453
4780
|
r"""Serialize to Markdown.
|
|
4454
4781
|
|
|
@@ -4494,8 +4821,18 @@ class DoclingDocument(BaseModel):
|
|
|
4494
4821
|
:param mark_annotations: bool: Whether to mark annotations in the export; only
|
|
4495
4822
|
relevant if include_annotations is True. (Default value = False).
|
|
4496
4823
|
:type mark_annotations: bool = False
|
|
4824
|
+
:param use_legacy_annotations: bool: Whether to use legacy annotation serialization.
|
|
4825
|
+
(Default value = False).
|
|
4826
|
+
:type use_legacy_annotations: bool = False
|
|
4827
|
+
:param mark_meta: bool: Whether to mark meta in the export; only
|
|
4828
|
+
relevant if use_legacy_annotations is False. (Default value = False).
|
|
4829
|
+
:type mark_meta: bool = False
|
|
4497
4830
|
:returns: The exported Markdown representation.
|
|
4498
4831
|
:rtype: str
|
|
4832
|
+
:param allowed_meta_names: Optional[set[str]]: Meta names to allow; None means all meta names are allowed.
|
|
4833
|
+
:type allowed_meta_names: Optional[set[str]] = None
|
|
4834
|
+
:param blocked_meta_names: Optional[set[str]]: Meta names to block; takes precedence over allowed_meta_names.
|
|
4835
|
+
:type blocked_meta_names: Optional[set[str]] = None
|
|
4499
4836
|
"""
|
|
4500
4837
|
from docling_core.transforms.serializer.markdown import (
|
|
4501
4838
|
MarkdownDocSerializer,
|
|
@@ -4524,7 +4861,11 @@ class DoclingDocument(BaseModel):
|
|
|
4524
4861
|
indent=indent,
|
|
4525
4862
|
wrap_width=text_width if text_width > 0 else None,
|
|
4526
4863
|
page_break_placeholder=page_break_placeholder,
|
|
4864
|
+
mark_meta=mark_meta,
|
|
4527
4865
|
include_annotations=include_annotations,
|
|
4866
|
+
use_legacy_annotations=use_legacy_annotations,
|
|
4867
|
+
allowed_meta_names=allowed_meta_names,
|
|
4868
|
+
blocked_meta_names=blocked_meta_names or set(),
|
|
4528
4869
|
mark_annotations=mark_annotations,
|
|
4529
4870
|
),
|
|
4530
4871
|
)
|
|
@@ -5530,16 +5871,17 @@ class DoclingDocument(BaseModel):
|
|
|
5530
5871
|
return CURRENT_VERSION
|
|
5531
5872
|
|
|
5532
5873
|
@model_validator(mode="after") # type: ignore
|
|
5533
|
-
|
|
5534
|
-
def validate_document(cls, d: "DoclingDocument"):
|
|
5874
|
+
def validate_document(self) -> Self:
|
|
5535
5875
|
"""validate_document."""
|
|
5536
5876
|
with warnings.catch_warnings():
|
|
5537
5877
|
# ignore warning from deprecated furniture
|
|
5538
5878
|
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
|
5539
|
-
if not
|
|
5879
|
+
if not self.validate_tree(self.body) or not self.validate_tree(
|
|
5880
|
+
self.furniture
|
|
5881
|
+
):
|
|
5540
5882
|
raise ValueError("Document hierachy is inconsistent.")
|
|
5541
5883
|
|
|
5542
|
-
return
|
|
5884
|
+
return self
|
|
5543
5885
|
|
|
5544
5886
|
@model_validator(mode="after")
|
|
5545
5887
|
def validate_misplaced_list_items(self):
|
|
@@ -5746,6 +6088,13 @@ class DoclingDocument(BaseModel):
|
|
|
5746
6088
|
return res_doc
|
|
5747
6089
|
|
|
5748
6090
|
def _validate_rules(self):
|
|
6091
|
+
|
|
6092
|
+
def validate_furniture(doc: DoclingDocument):
|
|
6093
|
+
if doc.furniture.children:
|
|
6094
|
+
raise ValueError(
|
|
6095
|
+
f"Deprecated furniture node {doc.furniture.self_ref} has children"
|
|
6096
|
+
)
|
|
6097
|
+
|
|
5749
6098
|
def validate_list_group(doc: DoclingDocument, item: ListGroup):
|
|
5750
6099
|
for ref in item.children:
|
|
5751
6100
|
child = ref.resolve(doc)
|
|
@@ -5768,6 +6117,8 @@ class DoclingDocument(BaseModel):
|
|
|
5768
6117
|
): # tolerate empty body, but not other groups
|
|
5769
6118
|
raise ValueError(f"Group {item.self_ref} has no children")
|
|
5770
6119
|
|
|
6120
|
+
validate_furniture(self)
|
|
6121
|
+
|
|
5771
6122
|
for item, _ in self.iterate_items(
|
|
5772
6123
|
with_groups=True,
|
|
5773
6124
|
traverse_pictures=True,
|
docling_core/types/doc/tokens.py
CHANGED
|
@@ -55,6 +55,7 @@ class _PictureClassificationToken(str, Enum):
|
|
|
55
55
|
PICTURE_GROUP = "<picture_group>"
|
|
56
56
|
|
|
57
57
|
# General
|
|
58
|
+
CHART = "<chart>"
|
|
58
59
|
PIE_CHART = "<pie_chart>"
|
|
59
60
|
BAR_CHART = "<bar_chart>"
|
|
60
61
|
STACKED_BAR_CHART = "<stacked_bar_chart>"
|
|
@@ -63,8 +64,12 @@ class _PictureClassificationToken(str, Enum):
|
|
|
63
64
|
SCATTER_CHART = "<scatter_chart>"
|
|
64
65
|
HEATMAP = "<heatmap>"
|
|
65
66
|
REMOTE_SENSING = "<remote_sensing>"
|
|
67
|
+
INFOGRAPHIC = "<infographic>"
|
|
68
|
+
DECORATION = "<decoration>"
|
|
69
|
+
ILLUSTRATION = "<illustration>"
|
|
66
70
|
|
|
67
71
|
NATURAL_IMAGE = "<natural_image>"
|
|
72
|
+
PERSON = "<person>"
|
|
68
73
|
|
|
69
74
|
# Chemistry
|
|
70
75
|
MOLECULAR_STRUCTURE = "<chemistry_molecular_structure>"
|
|
@@ -78,6 +83,7 @@ class _PictureClassificationToken(str, Enum):
|
|
|
78
83
|
QR_CODE = "<qr_code>"
|
|
79
84
|
BAR_CODE = "<bar_code>"
|
|
80
85
|
SCREENSHOT = "<screenshot>"
|
|
86
|
+
UI_ELEMENT = "<ui_element>"
|
|
81
87
|
|
|
82
88
|
# Geology/Geography
|
|
83
89
|
GEOGRAPHIC_MAP = "<map>"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.50.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -15,12 +15,17 @@ Classifier: Intended Audience :: Developers
|
|
|
15
15
|
Classifier: Intended Audience :: Science/Research
|
|
16
16
|
Classifier: Natural Language :: English
|
|
17
17
|
Classifier: Operating System :: OS Independent
|
|
18
|
-
Classifier: Programming Language :: Python :: 3
|
|
19
18
|
Classifier: Topic :: Database
|
|
20
19
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
21
20
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
22
21
|
Classifier: Typing :: Typed
|
|
23
22
|
Classifier: Programming Language :: Python :: 3
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
25
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
26
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
27
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
28
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
24
29
|
Requires-Python: <4.0,>=3.9
|
|
25
30
|
Description-Content-Type: text/markdown
|
|
26
31
|
License-File: LICENSE
|
|
@@ -29,7 +34,7 @@ Requires-Dist: pydantic!=2.10.0,!=2.10.1,!=2.10.2,<3.0.0,>=2.6.0
|
|
|
29
34
|
Requires-Dist: jsonref<2.0.0,>=1.1.0
|
|
30
35
|
Requires-Dist: tabulate<0.10.0,>=0.9.0
|
|
31
36
|
Requires-Dist: pandas<3.0.0,>=2.1.4
|
|
32
|
-
Requires-Dist: pillow<
|
|
37
|
+
Requires-Dist: pillow<13.0.0,>=10.0.0
|
|
33
38
|
Requires-Dist: pyyaml<7.0.0,>=5.1
|
|
34
39
|
Requires-Dist: typing-extensions<5.0.0,>=4.12.2
|
|
35
40
|
Requires-Dist: typer<0.20.0,>=0.12.5
|
|
@@ -39,7 +44,7 @@ Requires-Dist: semchunk<3.0.0,>=2.2.0; extra == "chunking"
|
|
|
39
44
|
Requires-Dist: transformers<5.0.0,>=4.34.0; extra == "chunking"
|
|
40
45
|
Provides-Extra: chunking-openai
|
|
41
46
|
Requires-Dist: semchunk; extra == "chunking-openai"
|
|
42
|
-
Requires-Dist: tiktoken<0.
|
|
47
|
+
Requires-Dist: tiktoken<0.13.0,>=0.9.0; extra == "chunking-openai"
|
|
43
48
|
Dynamic: license-file
|
|
44
49
|
|
|
45
50
|
# Docling Core
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
docling_core/__init__.py,sha256=D0afxif-BMUrgx2cYk1cwxiwATRYaGXsIMk_z4nw1Vs,90
|
|
2
2
|
docling_core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
3
|
docling_core/cli/__init__.py,sha256=C63yWifzpA0IV7YWDatpAdrhoV8zjqxAKv0xMf09VdM,19
|
|
4
|
-
docling_core/cli/view.py,sha256
|
|
4
|
+
docling_core/cli/view.py,sha256=-WlYrybebqKUFyyXA5OAhFgDtgSzBh9zEAnvZZpnjaE,2232
|
|
5
5
|
docling_core/experimental/__init__.py,sha256=XnAVSUHbA6OFhNSpoYqSD3u83-xVaUaki1DIKFw69Ew,99
|
|
6
6
|
docling_core/resources/schemas/doc/ANN.json,sha256=04U5j-PU9m5w7IagJ_rHcAx7qUtLkUuaWZO9GuYHnTA,4202
|
|
7
7
|
docling_core/resources/schemas/doc/DOC.json,sha256=9tVKpCqDGGq3074Nn5qlUCdTN-5k1Q0ri_scJblwnLE,6686
|
|
@@ -27,12 +27,12 @@ docling_core/transforms/chunker/tokenizer/base.py,sha256=2gOBQPYJYC0iWXOgMG3DiNP
|
|
|
27
27
|
docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=aZ_RNQIzcNkAHGHZw3SBCoqJHM2Ihb65eiM29O9BR6o,2506
|
|
28
28
|
docling_core/transforms/chunker/tokenizer/openai.py,sha256=zt2kwcC-r8MafeEG0CESab8E4RIC9aaFXxxnxOGyTMA,918
|
|
29
29
|
docling_core/transforms/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3qFhKhe2HcYghN6_xw,105
|
|
30
|
-
docling_core/transforms/serializer/base.py,sha256=
|
|
31
|
-
docling_core/transforms/serializer/common.py,sha256=
|
|
32
|
-
docling_core/transforms/serializer/doctags.py,sha256=
|
|
33
|
-
docling_core/transforms/serializer/html.py,sha256=
|
|
30
|
+
docling_core/transforms/serializer/base.py,sha256=aSzn2_2wTmty_gLVrOfHINHRU4HT473e_ldmop-CV2A,8092
|
|
31
|
+
docling_core/transforms/serializer/common.py,sha256=GvgArh-y9dl1j651MF2BT4psVn2PWnkWxczu13WuKEI,22202
|
|
32
|
+
docling_core/transforms/serializer/doctags.py,sha256=EpvIjGdsl1DoD-xgNjui6w4F9qbVwm3uCE3hB0CEZ-I,21383
|
|
33
|
+
docling_core/transforms/serializer/html.py,sha256=hIjqEtKxI0t2a_Av9IZKK5tTa3GL_-KPovoGnX2cxa0,41009
|
|
34
34
|
docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
|
|
35
|
-
docling_core/transforms/serializer/markdown.py,sha256=
|
|
35
|
+
docling_core/transforms/serializer/markdown.py,sha256=pFvcpEhMML9HugtiZWSRbzmvIe2zeHep9giXTqSWXo4,28143
|
|
36
36
|
docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
|
|
37
37
|
docling_core/transforms/visualizer/base.py,sha256=aEF7b3rHq6DVdX8zDYEPoq55BHDYe4Hh_97lBdcW4lY,555
|
|
38
38
|
docling_core/transforms/visualizer/key_value_visualizer.py,sha256=fp7nFLy4flOSiavdRgg5y1Mu7WVLIDGh1zEHsq8kgVM,8979
|
|
@@ -41,12 +41,12 @@ docling_core/transforms/visualizer/reading_order_visualizer.py,sha256=muqmaxOBao
|
|
|
41
41
|
docling_core/transforms/visualizer/table_visualizer.py,sha256=iJPjk-XQSSCH3oujcjPMz-redAwNNHseZ41lFyd-u3k,8097
|
|
42
42
|
docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
|
|
43
43
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
44
|
-
docling_core/types/doc/__init__.py,sha256=
|
|
44
|
+
docling_core/types/doc/__init__.py,sha256=V5M_Oi2ALsvA3Z6K3bg8x3aHzDzXl_ErSn0AiOZCJNM,1915
|
|
45
45
|
docling_core/types/doc/base.py,sha256=i98y4IF250adR-8BSS374K90fwfwG-vBfWh14tLC5Cs,15906
|
|
46
|
-
docling_core/types/doc/document.py,sha256=
|
|
46
|
+
docling_core/types/doc/document.py,sha256=2UMPfEQIpNxxulamm6fbK4pewohpCS23-O_H2RGSmvI,216223
|
|
47
47
|
docling_core/types/doc/labels.py,sha256=-W1-LW6z0J9F9ExJqR0Wd1WeqWTaY3Unm-j1UkQGlC4,7330
|
|
48
48
|
docling_core/types/doc/page.py,sha256=35h1xdtCM3-AaN8Dim9jDseZIiw-3GxpB-ofF-H2rQQ,41878
|
|
49
|
-
docling_core/types/doc/tokens.py,sha256=
|
|
49
|
+
docling_core/types/doc/tokens.py,sha256=MkmclSjfqoXyiefMTGauAyCRx3JTtvbOn5-qx_-i4JE,9458
|
|
50
50
|
docling_core/types/doc/utils.py,sha256=wKC9SJgS4ZKdoYPAlNuRyncv9RIEewzVCBmwbUmbA6E,9106
|
|
51
51
|
docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
|
|
52
52
|
docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
|
|
@@ -76,9 +76,9 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
|
|
|
76
76
|
docling_core/utils/legacy.py,sha256=G7ed8fkBpIO8hG3DKEY83cHsrKJHyvDst_1jSdgBXMI,24406
|
|
77
77
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
78
78
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
79
|
-
docling_core-2.
|
|
80
|
-
docling_core-2.
|
|
81
|
-
docling_core-2.
|
|
82
|
-
docling_core-2.
|
|
83
|
-
docling_core-2.
|
|
84
|
-
docling_core-2.
|
|
79
|
+
docling_core-2.50.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
80
|
+
docling_core-2.50.0.dist-info/METADATA,sha256=CcX98hyuxrAftDSKBirRIjPlYs2GM5uF60T5loEiYLE,6710
|
|
81
|
+
docling_core-2.50.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
82
|
+
docling_core-2.50.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
|
|
83
|
+
docling_core-2.50.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
|
|
84
|
+
docling_core-2.50.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|