docling-core: docling_core-2.39.0-py3-none-any.whl → docling_core-2.41.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core has been flagged as potentially problematic. (The original page links to more details; the link target was not preserved in this text.)

@@ -3,7 +3,7 @@
3
3
  from enum import Enum
4
4
  from typing import List, Tuple
5
5
 
6
- from pydantic import BaseModel
6
+ from pydantic import BaseModel, FieldSerializationInfo, field_serializer
7
7
 
8
8
 
9
9
  class ImageRefMode(str, Enum):
@@ -21,12 +21,28 @@ class CoordOrigin(str, Enum):
21
21
  BOTTOMLEFT = "BOTTOMLEFT"
22
22
 
23
23
 
24
+ _CTX_COORD_PREC = "coord_prec"
25
+
26
+
27
+ def _serialize_precision(
28
+ value: float, info: FieldSerializationInfo, ctx_key: str
29
+ ) -> float:
30
+ precision = info.context.get(ctx_key) if info.context else None
31
+ if isinstance(precision, int):
32
+ return round(value, precision)
33
+ return value
34
+
35
+
24
36
  class Size(BaseModel):
25
37
  """Size."""
26
38
 
27
39
  width: float = 0.0
28
40
  height: float = 0.0
29
41
 
42
+ @field_serializer("width", "height")
43
+ def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
44
+ return _serialize_precision(value, info, _CTX_COORD_PREC)
45
+
30
46
  def as_tuple(self):
31
47
  """as_tuple."""
32
48
  return (self.width, self.height)
@@ -52,6 +68,10 @@ class BoundingBox(BaseModel):
52
68
  """height."""
53
69
  return abs(self.t - self.b)
54
70
 
71
+ @field_serializer("l", "t", "r", "b")
72
+ def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
73
+ return _serialize_precision(value, info, _CTX_COORD_PREC)
74
+
55
75
  def resize_by_scale(self, x_scale: float, y_scale: float):
56
76
  """resize_by_scale."""
57
77
  return BoundingBox(
@@ -26,8 +26,10 @@ from pydantic import (
26
26
  BaseModel,
27
27
  ConfigDict,
28
28
  Field,
29
+ FieldSerializationInfo,
29
30
  StringConstraints,
30
31
  computed_field,
32
+ field_serializer,
31
33
  field_validator,
32
34
  model_validator,
33
35
  validate_call,
@@ -38,7 +40,12 @@ from typing_extensions import Annotated, Self, deprecated
38
40
  from docling_core.search.package import VERSION_PATTERN
39
41
  from docling_core.types.base import _JSON_POINTER_REGEX
40
42
  from docling_core.types.doc import BoundingBox, Size
41
- from docling_core.types.doc.base import CoordOrigin, ImageRefMode
43
+ from docling_core.types.doc.base import (
44
+ _CTX_COORD_PREC,
45
+ CoordOrigin,
46
+ ImageRefMode,
47
+ _serialize_precision,
48
+ )
42
49
  from docling_core.types.doc.labels import (
43
50
  CodeLanguageLabel,
44
51
  DocItemLabel,
@@ -85,6 +92,8 @@ DOCUMENT_TOKENS_EXPORT_LABELS.update(
85
92
  ]
86
93
  )
87
94
 
95
+ _CTX_CONFID_PREC = "confid_prec"
96
+
88
97
 
89
98
  class BaseAnnotation(BaseModel):
90
99
  """Base class for all annotation types."""
@@ -98,6 +107,10 @@ class PictureClassificationClass(BaseModel):
98
107
  class_name: str
99
108
  confidence: float
100
109
 
110
+ @field_serializer("confidence")
111
+ def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
112
+ return _serialize_precision(value, info, _CTX_CONFID_PREC)
113
+
101
114
 
102
115
  class PictureClassificationData(BaseAnnotation):
103
116
  """PictureClassificationData."""
@@ -125,6 +138,10 @@ class PictureMoleculeData(BaseAnnotation):
125
138
  segmentation: List[Tuple[float, float]]
126
139
  provenance: str
127
140
 
141
+ @field_serializer("confidence")
142
+ def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
143
+ return _serialize_precision(value, info, _CTX_CONFID_PREC)
144
+
128
145
 
129
146
  class MiscAnnotation(BaseAnnotation):
130
147
  """MiscAnnotation."""
@@ -731,9 +748,11 @@ class ProvenanceItem(BaseModel):
731
748
  class ContentLayer(str, Enum):
732
749
  """ContentLayer."""
733
750
 
734
- BODY = "body"
735
- FURNITURE = "furniture"
736
- BACKGROUND = "background"
751
+ BODY = "body" # main content of the document
752
+ FURNITURE = "furniture" # eg page-headers/footers
753
+ BACKGROUND = "background" # eg watermarks
754
+ INVISIBLE = "invisible" # hidden or invisible text
755
+ NOTES = "notes" # author/speaker notes, corrections, etc
737
756
 
738
757
 
739
758
  DEFAULT_CONTENT_LAYERS = {ContentLayer.BODY}
@@ -3046,6 +3065,8 @@ class DoclingDocument(BaseModel):
3046
3065
  artifacts_dir: Optional[Path] = None,
3047
3066
  image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
3048
3067
  indent: int = 2,
3068
+ coord_precision: Optional[int] = None,
3069
+ confid_precision: Optional[int] = None,
3049
3070
  ):
3050
3071
  """Save as json."""
3051
3072
  if isinstance(filename, str):
@@ -3059,7 +3080,9 @@ class DoclingDocument(BaseModel):
3059
3080
  artifacts_dir, image_mode, reference_path=reference_path
3060
3081
  )
3061
3082
 
3062
- out = new_doc.export_to_dict()
3083
+ out = new_doc.export_to_dict(
3084
+ coord_precision=coord_precision, confid_precision=confid_precision
3085
+ )
3063
3086
  with open(filename, "w", encoding="utf-8") as fw:
3064
3087
  json.dump(out, fw, indent=indent)
3065
3088
 
@@ -3085,6 +3108,8 @@ class DoclingDocument(BaseModel):
3085
3108
  artifacts_dir: Optional[Path] = None,
3086
3109
  image_mode: ImageRefMode = ImageRefMode.EMBEDDED,
3087
3110
  default_flow_style: bool = False,
3111
+ coord_precision: Optional[int] = None,
3112
+ confid_precision: Optional[int] = None,
3088
3113
  ):
3089
3114
  """Save as yaml."""
3090
3115
  if isinstance(filename, str):
@@ -3098,7 +3123,9 @@ class DoclingDocument(BaseModel):
3098
3123
  artifacts_dir, image_mode, reference_path=reference_path
3099
3124
  )
3100
3125
 
3101
- out = new_doc.export_to_dict()
3126
+ out = new_doc.export_to_dict(
3127
+ coord_precision=coord_precision, confid_precision=confid_precision
3128
+ )
3102
3129
  with open(filename, "w", encoding="utf-8") as fw:
3103
3130
  yaml.dump(out, fw, default_flow_style=default_flow_style)
3104
3131
 
@@ -3123,9 +3150,18 @@ class DoclingDocument(BaseModel):
3123
3150
  mode: str = "json",
3124
3151
  by_alias: bool = True,
3125
3152
  exclude_none: bool = True,
3153
+ coord_precision: Optional[int] = None,
3154
+ confid_precision: Optional[int] = None,
3126
3155
  ) -> Dict[str, Any]:
3127
3156
  """Export to dict."""
3128
- out = self.model_dump(mode=mode, by_alias=by_alias, exclude_none=exclude_none)
3157
+ context = {}
3158
+ if coord_precision is not None:
3159
+ context[_CTX_COORD_PREC] = coord_precision
3160
+ if confid_precision is not None:
3161
+ context[_CTX_CONFID_PREC] = confid_precision
3162
+ out = self.model_dump(
3163
+ mode=mode, by_alias=by_alias, exclude_none=exclude_none, context=context
3164
+ )
3129
3165
 
3130
3166
  return out
3131
3167
 
@@ -25,9 +25,21 @@ import numpy as np
25
25
  from PIL import Image as PILImage
26
26
  from PIL import ImageColor, ImageDraw, ImageFont
27
27
  from PIL.ImageFont import FreeTypeFont
28
- from pydantic import AnyUrl, BaseModel, Field, model_validator
28
+ from pydantic import (
29
+ AnyUrl,
30
+ BaseModel,
31
+ Field,
32
+ FieldSerializationInfo,
33
+ field_serializer,
34
+ model_validator,
35
+ )
29
36
 
30
- from docling_core.types.doc.base import BoundingBox, CoordOrigin
37
+ from docling_core.types.doc.base import (
38
+ _CTX_COORD_PREC,
39
+ BoundingBox,
40
+ CoordOrigin,
41
+ _serialize_precision,
42
+ )
31
43
  from docling_core.types.doc.document import ImageRef
32
44
 
33
45
  _logger = logging.getLogger(__name__)
@@ -105,6 +117,10 @@ class BoundingRectangle(BaseModel):
105
117
 
106
118
  coord_origin: CoordOrigin = CoordOrigin.BOTTOMLEFT
107
119
 
120
+ @field_serializer("r_x0", "r_y0", "r_x1", "r_y1", "r_x2", "r_y2", "r_x3", "r_y3")
121
+ def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
122
+ return _serialize_precision(value, info, _CTX_COORD_PREC)
123
+
108
124
  @property
109
125
  def width(self) -> float:
110
126
  """Calculate the width of the rectangle."""
@@ -122,6 +138,8 @@ class BoundingRectangle(BaseModel):
122
138
  p_1 = ((self.r_x1 + self.r_x2) / 2.0, (self.r_y1 + self.r_y2) / 2.0)
123
139
 
124
140
  delta_x, delta_y = p_1[0] - p_0[0], p_1[1] - p_0[1]
141
+ if self.coord_origin == CoordOrigin.TOPLEFT:
142
+ delta_y = -delta_y
125
143
 
126
144
  if abs(delta_y) < 1.0e-3:
127
145
  angle = 0.0
@@ -131,8 +149,7 @@ class BoundingRectangle(BaseModel):
131
149
  angle = math.atan(delta_y / delta_x)
132
150
  if delta_x < 0:
133
151
  angle += np.pi
134
- if angle < 0:
135
- angle += 2 * np.pi
152
+ angle = angle % (2 * np.pi)
136
153
  return angle
137
154
 
138
155
  @property
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.39.0
3
+ Version: 2.41.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -40,10 +40,10 @@ docling_core/transforms/visualizer/table_visualizer.py,sha256=iJPjk-XQSSCH3oujcj
40
40
  docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
41
41
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
42
42
  docling_core/types/doc/__init__.py,sha256=8hOhm5W9mArf3zwgfoMxDs1pHizhLFSAZlLu1tPBBRk,1641
43
- docling_core/types/doc/base.py,sha256=ndXquBrOKTFQApIJ5s2-zstj3xlVKRbJDSId0KOQnUg,14817
44
- docling_core/types/doc/document.py,sha256=Wu9st8xomDWF8a-GtHn6TSGbkqiMxPLzCUCnYXMhgOc,157212
43
+ docling_core/types/doc/base.py,sha256=u8sFLA29x8QphvLzgy2wAKu3HXyM2GODfBXqEwQMrTY,15527
44
+ docling_core/types/doc/document.py,sha256=YAJIIdT2fBnlp8ASWvzJTjUbil_ZCwuBBSjmiwhZ1KI,158630
45
45
  docling_core/types/doc/labels.py,sha256=-W1-LW6z0J9F9ExJqR0Wd1WeqWTaY3Unm-j1UkQGlC4,7330
46
- docling_core/types/doc/page.py,sha256=GV9UnGCvvqs6KD_ac3hF6b_NH6M6IevsL5iSt8WWVCI,41221
46
+ docling_core/types/doc/page.py,sha256=CH9DY3LLgnUdhRuJBWfnkDkPBdRzz9yi4el1LsxJSME,41651
47
47
  docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
48
48
  docling_core/types/doc/utils.py,sha256=JpAi7x9DHksFlIj_gRJPcSZOHa8AHvVPEO_K9aSnw4c,2608
49
49
  docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
@@ -74,9 +74,9 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
74
74
  docling_core/utils/legacy.py,sha256=5lghO48OEcV9V51tRnH3YSKgLtdqhr-Q5C_OcJZ8TOs,24392
75
75
  docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
76
76
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
77
- docling_core-2.39.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
78
- docling_core-2.39.0.dist-info/METADATA,sha256=Yn1ptbMIE-Tj9y-pSXuKpdotyq34lyHnFWRxWNT-9qY,6453
79
- docling_core-2.39.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
80
- docling_core-2.39.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
81
- docling_core-2.39.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
82
- docling_core-2.39.0.dist-info/RECORD,,
77
+ docling_core-2.41.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
78
+ docling_core-2.41.0.dist-info/METADATA,sha256=CqsXanxB2dd22G__-Ws0XdLDzkf9uwMGmx98V2h9f3k,6453
79
+ docling_core-2.41.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
80
+ docling_core-2.41.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
81
+ docling_core-2.41.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
82
+ docling_core-2.41.0.dist-info/RECORD,,