docling-core 2.39.0__py3-none-any.whl → 2.41.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/types/doc/base.py +21 -1
- docling_core/types/doc/document.py +43 -7
- docling_core/types/doc/page.py +21 -4
- {docling_core-2.39.0.dist-info → docling_core-2.41.0.dist-info}/METADATA +1 -1
- {docling_core-2.39.0.dist-info → docling_core-2.41.0.dist-info}/RECORD +9 -9
- {docling_core-2.39.0.dist-info → docling_core-2.41.0.dist-info}/WHEEL +0 -0
- {docling_core-2.39.0.dist-info → docling_core-2.41.0.dist-info}/entry_points.txt +0 -0
- {docling_core-2.39.0.dist-info → docling_core-2.41.0.dist-info}/licenses/LICENSE +0 -0
- {docling_core-2.39.0.dist-info → docling_core-2.41.0.dist-info}/top_level.txt +0 -0
docling_core/types/doc/base.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
from enum import Enum
|
|
4
4
|
from typing import List, Tuple
|
|
5
5
|
|
|
6
|
-
from pydantic import BaseModel
|
|
6
|
+
from pydantic import BaseModel, FieldSerializationInfo, field_serializer
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
class ImageRefMode(str, Enum):
|
|
@@ -21,12 +21,28 @@ class CoordOrigin(str, Enum):
|
|
|
21
21
|
BOTTOMLEFT = "BOTTOMLEFT"
|
|
22
22
|
|
|
23
23
|
|
|
24
|
+
_CTX_COORD_PREC = "coord_prec"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _serialize_precision(
|
|
28
|
+
value: float, info: FieldSerializationInfo, ctx_key: str
|
|
29
|
+
) -> float:
|
|
30
|
+
precision = info.context.get(ctx_key) if info.context else None
|
|
31
|
+
if isinstance(precision, int):
|
|
32
|
+
return round(value, precision)
|
|
33
|
+
return value
|
|
34
|
+
|
|
35
|
+
|
|
24
36
|
class Size(BaseModel):
|
|
25
37
|
"""Size."""
|
|
26
38
|
|
|
27
39
|
width: float = 0.0
|
|
28
40
|
height: float = 0.0
|
|
29
41
|
|
|
42
|
+
@field_serializer("width", "height")
|
|
43
|
+
def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
|
|
44
|
+
return _serialize_precision(value, info, _CTX_COORD_PREC)
|
|
45
|
+
|
|
30
46
|
def as_tuple(self):
|
|
31
47
|
"""as_tuple."""
|
|
32
48
|
return (self.width, self.height)
|
|
@@ -52,6 +68,10 @@ class BoundingBox(BaseModel):
|
|
|
52
68
|
"""height."""
|
|
53
69
|
return abs(self.t - self.b)
|
|
54
70
|
|
|
71
|
+
@field_serializer("l", "t", "r", "b")
|
|
72
|
+
def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
|
|
73
|
+
return _serialize_precision(value, info, _CTX_COORD_PREC)
|
|
74
|
+
|
|
55
75
|
def resize_by_scale(self, x_scale: float, y_scale: float):
|
|
56
76
|
"""resize_by_scale."""
|
|
57
77
|
return BoundingBox(
|
|
docling_core/types/doc/document.py
CHANGED
|
@@ -26,8 +26,10 @@ from pydantic import (
|
|
|
26
26
|
BaseModel,
|
|
27
27
|
ConfigDict,
|
|
28
28
|
Field,
|
|
29
|
+
FieldSerializationInfo,
|
|
29
30
|
StringConstraints,
|
|
30
31
|
computed_field,
|
|
32
|
+
field_serializer,
|
|
31
33
|
field_validator,
|
|
32
34
|
model_validator,
|
|
33
35
|
validate_call,
|
|
@@ -38,7 +40,12 @@ from typing_extensions import Annotated, Self, deprecated
|
|
|
38
40
|
from docling_core.search.package import VERSION_PATTERN
|
|
39
41
|
from docling_core.types.base import _JSON_POINTER_REGEX
|
|
40
42
|
from docling_core.types.doc import BoundingBox, Size
|
|
41
|
-
from docling_core.types.doc.base import CoordOrigin, ImageRefMode
|
|
43
|
+
from docling_core.types.doc.base import (
|
|
44
|
+
_CTX_COORD_PREC,
|
|
45
|
+
CoordOrigin,
|
|
46
|
+
ImageRefMode,
|
|
47
|
+
_serialize_precision,
|
|
48
|
+
)
|
|
42
49
|
from docling_core.types.doc.labels import (
|
|
43
50
|
CodeLanguageLabel,
|
|
44
51
|
DocItemLabel,
|
|
@@ -85,6 +92,8 @@ DOCUMENT_TOKENS_EXPORT_LABELS.update(
|
|
|
85
92
|
]
|
|
86
93
|
)
|
|
87
94
|
|
|
95
|
+
_CTX_CONFID_PREC = "confid_prec"
|
|
96
|
+
|
|
88
97
|
|
|
89
98
|
class BaseAnnotation(BaseModel):
|
|
90
99
|
"""Base class for all annotation types."""
|
|
@@ -98,6 +107,10 @@ class PictureClassificationClass(BaseModel):
|
|
|
98
107
|
class_name: str
|
|
99
108
|
confidence: float
|
|
100
109
|
|
|
110
|
+
@field_serializer("confidence")
|
|
111
|
+
def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
|
|
112
|
+
return _serialize_precision(value, info, _CTX_CONFID_PREC)
|
|
113
|
+
|
|
101
114
|
|
|
102
115
|
class PictureClassificationData(BaseAnnotation):
|
|
103
116
|
"""PictureClassificationData."""
|
|
@@ -125,6 +138,10 @@ class PictureMoleculeData(BaseAnnotation):
|
|
|
125
138
|
segmentation: List[Tuple[float, float]]
|
|
126
139
|
provenance: str
|
|
127
140
|
|
|
141
|
+
@field_serializer("confidence")
|
|
142
|
+
def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
|
|
143
|
+
return _serialize_precision(value, info, _CTX_CONFID_PREC)
|
|
144
|
+
|
|
128
145
|
|
|
129
146
|
class MiscAnnotation(BaseAnnotation):
|
|
130
147
|
"""MiscAnnotation."""
|
|
@@ -731,9 +748,11 @@ class ProvenanceItem(BaseModel):
|
|
|
731
748
|
class ContentLayer(str, Enum):
|
|
732
749
|
"""ContentLayer."""
|
|
733
750
|
|
|
734
|
-
BODY = "body"
|
|
735
|
-
FURNITURE = "furniture"
|
|
736
|
-
BACKGROUND = "background"
|
|
751
|
+
BODY = "body" # main content of the document
|
|
752
|
+
FURNITURE = "furniture" # eg page-headers/footers
|
|
753
|
+
BACKGROUND = "background" # eg watermarks
|
|
754
|
+
INVISIBLE = "invisible" # hidden or invisible text
|
|
755
|
+
NOTES = "notes" # author/speaker notes, corrections, etc
|
|
737
756
|
|
|
738
757
|
|
|
739
758
|
DEFAULT_CONTENT_LAYERS = {ContentLayer.BODY}
|
|
@@ -3046,6 +3065,8 @@ class DoclingDocument(BaseModel):
|
|
|
3046
3065
|
artifacts_dir: Optional[Path] = None,
|
|
3047
3066
|
image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
|
|
3048
3067
|
indent: int = 2,
|
|
3068
|
+
coord_precision: Optional[int] = None,
|
|
3069
|
+
confid_precision: Optional[int] = None,
|
|
3049
3070
|
):
|
|
3050
3071
|
"""Save as json."""
|
|
3051
3072
|
if isinstance(filename, str):
|
|
@@ -3059,7 +3080,9 @@ class DoclingDocument(BaseModel):
|
|
|
3059
3080
|
artifacts_dir, image_mode, reference_path=reference_path
|
|
3060
3081
|
)
|
|
3061
3082
|
|
|
3062
|
-
out = new_doc.export_to_dict()
|
|
3083
|
+
out = new_doc.export_to_dict(
|
|
3084
|
+
coord_precision=coord_precision, confid_precision=confid_precision
|
|
3085
|
+
)
|
|
3063
3086
|
with open(filename, "w", encoding="utf-8") as fw:
|
|
3064
3087
|
json.dump(out, fw, indent=indent)
|
|
3065
3088
|
|
|
@@ -3085,6 +3108,8 @@ class DoclingDocument(BaseModel):
|
|
|
3085
3108
|
artifacts_dir: Optional[Path] = None,
|
|
3086
3109
|
image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
|
|
3087
3110
|
default_flow_style: bool = False,
|
|
3111
|
+
coord_precision: Optional[int] = None,
|
|
3112
|
+
confid_precision: Optional[int] = None,
|
|
3088
3113
|
):
|
|
3089
3114
|
"""Save as yaml."""
|
|
3090
3115
|
if isinstance(filename, str):
|
|
@@ -3098,7 +3123,9 @@ class DoclingDocument(BaseModel):
|
|
|
3098
3123
|
artifacts_dir, image_mode, reference_path=reference_path
|
|
3099
3124
|
)
|
|
3100
3125
|
|
|
3101
|
-
out = new_doc.export_to_dict()
|
|
3126
|
+
out = new_doc.export_to_dict(
|
|
3127
|
+
coord_precision=coord_precision, confid_precision=confid_precision
|
|
3128
|
+
)
|
|
3102
3129
|
with open(filename, "w", encoding="utf-8") as fw:
|
|
3103
3130
|
yaml.dump(out, fw, default_flow_style=default_flow_style)
|
|
3104
3131
|
|
|
@@ -3123,9 +3150,18 @@ class DoclingDocument(BaseModel):
|
|
|
3123
3150
|
mode: str = "json",
|
|
3124
3151
|
by_alias: bool = True,
|
|
3125
3152
|
exclude_none: bool = True,
|
|
3153
|
+
coord_precision: Optional[int] = None,
|
|
3154
|
+
confid_precision: Optional[int] = None,
|
|
3126
3155
|
) -> Dict[str, Any]:
|
|
3127
3156
|
"""Export to dict."""
|
|
3128
|
-
|
|
3157
|
+
context = {}
|
|
3158
|
+
if coord_precision is not None:
|
|
3159
|
+
context[_CTX_COORD_PREC] = coord_precision
|
|
3160
|
+
if confid_precision is not None:
|
|
3161
|
+
context[_CTX_CONFID_PREC] = confid_precision
|
|
3162
|
+
out = self.model_dump(
|
|
3163
|
+
mode=mode, by_alias=by_alias, exclude_none=exclude_none, context=context
|
|
3164
|
+
)
|
|
3129
3165
|
|
|
3130
3166
|
return out
|
|
3131
3167
|
|
docling_core/types/doc/page.py
CHANGED
|
@@ -25,9 +25,21 @@ import numpy as np
|
|
|
25
25
|
from PIL import Image as PILImage
|
|
26
26
|
from PIL import ImageColor, ImageDraw, ImageFont
|
|
27
27
|
from PIL.ImageFont import FreeTypeFont
|
|
28
|
-
from pydantic import AnyUrl, BaseModel, Field, model_validator
|
|
28
|
+
from pydantic import (
|
|
29
|
+
AnyUrl,
|
|
30
|
+
BaseModel,
|
|
31
|
+
Field,
|
|
32
|
+
FieldSerializationInfo,
|
|
33
|
+
field_serializer,
|
|
34
|
+
model_validator,
|
|
35
|
+
)
|
|
29
36
|
|
|
30
|
-
from docling_core.types.doc.base import BoundingBox, CoordOrigin
|
|
37
|
+
from docling_core.types.doc.base import (
|
|
38
|
+
_CTX_COORD_PREC,
|
|
39
|
+
BoundingBox,
|
|
40
|
+
CoordOrigin,
|
|
41
|
+
_serialize_precision,
|
|
42
|
+
)
|
|
31
43
|
from docling_core.types.doc.document import ImageRef
|
|
32
44
|
|
|
33
45
|
_logger = logging.getLogger(__name__)
|
|
@@ -105,6 +117,10 @@ class BoundingRectangle(BaseModel):
|
|
|
105
117
|
|
|
106
118
|
coord_origin: CoordOrigin = CoordOrigin.BOTTOMLEFT
|
|
107
119
|
|
|
120
|
+
@field_serializer("r_x0", "r_y0", "r_x1", "r_y1", "r_x2", "r_y2", "r_x3", "r_y3")
|
|
121
|
+
def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
|
|
122
|
+
return _serialize_precision(value, info, _CTX_COORD_PREC)
|
|
123
|
+
|
|
108
124
|
@property
|
|
109
125
|
def width(self) -> float:
|
|
110
126
|
"""Calculate the width of the rectangle."""
|
|
@@ -122,6 +138,8 @@ class BoundingRectangle(BaseModel):
|
|
|
122
138
|
p_1 = ((self.r_x1 + self.r_x2) / 2.0, (self.r_y1 + self.r_y2) / 2.0)
|
|
123
139
|
|
|
124
140
|
delta_x, delta_y = p_1[0] - p_0[0], p_1[1] - p_0[1]
|
|
141
|
+
if self.coord_origin == CoordOrigin.TOPLEFT:
|
|
142
|
+
delta_y = -delta_y
|
|
125
143
|
|
|
126
144
|
if abs(delta_y) < 1.0e-3:
|
|
127
145
|
angle = 0.0
|
|
@@ -131,8 +149,7 @@ class BoundingRectangle(BaseModel):
|
|
|
131
149
|
angle = math.atan(delta_y / delta_x)
|
|
132
150
|
if delta_x < 0:
|
|
133
151
|
angle += np.pi
|
|
134
|
-
|
|
135
|
-
angle += 2 * np.pi
|
|
152
|
+
angle = angle % (2 * np.pi)
|
|
136
153
|
return angle
|
|
137
154
|
|
|
138
155
|
@property
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.39.0
|
|
3
|
+
Version: 2.41.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -40,10 +40,10 @@ docling_core/transforms/visualizer/table_visualizer.py,sha256=iJPjk-XQSSCH3oujcj
|
|
|
40
40
|
docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
|
|
41
41
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
42
42
|
docling_core/types/doc/__init__.py,sha256=8hOhm5W9mArf3zwgfoMxDs1pHizhLFSAZlLu1tPBBRk,1641
|
|
43
|
-
docling_core/types/doc/base.py,sha256=
|
|
44
|
-
docling_core/types/doc/document.py,sha256=
|
|
43
|
+
docling_core/types/doc/base.py,sha256=u8sFLA29x8QphvLzgy2wAKu3HXyM2GODfBXqEwQMrTY,15527
|
|
44
|
+
docling_core/types/doc/document.py,sha256=YAJIIdT2fBnlp8ASWvzJTjUbil_ZCwuBBSjmiwhZ1KI,158630
|
|
45
45
|
docling_core/types/doc/labels.py,sha256=-W1-LW6z0J9F9ExJqR0Wd1WeqWTaY3Unm-j1UkQGlC4,7330
|
|
46
|
-
docling_core/types/doc/page.py,sha256=
|
|
46
|
+
docling_core/types/doc/page.py,sha256=CH9DY3LLgnUdhRuJBWfnkDkPBdRzz9yi4el1LsxJSME,41651
|
|
47
47
|
docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
|
|
48
48
|
docling_core/types/doc/utils.py,sha256=JpAi7x9DHksFlIj_gRJPcSZOHa8AHvVPEO_K9aSnw4c,2608
|
|
49
49
|
docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
|
|
@@ -74,9 +74,9 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
|
|
|
74
74
|
docling_core/utils/legacy.py,sha256=5lghO48OEcV9V51tRnH3YSKgLtdqhr-Q5C_OcJZ8TOs,24392
|
|
75
75
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
76
76
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
77
|
-
docling_core-2.
|
|
78
|
-
docling_core-2.
|
|
79
|
-
docling_core-2.
|
|
80
|
-
docling_core-2.
|
|
81
|
-
docling_core-2.
|
|
82
|
-
docling_core-2.
|
|
77
|
+
docling_core-2.41.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
78
|
+
docling_core-2.41.0.dist-info/METADATA,sha256=CqsXanxB2dd22G__-Ws0XdLDzkf9uwMGmx98V2h9f3k,6453
|
|
79
|
+
docling_core-2.41.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
80
|
+
docling_core-2.41.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
|
|
81
|
+
docling_core-2.41.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
|
|
82
|
+
docling_core-2.41.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|