docling-core 2.15.0__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. (The original page linked to more details here.)

@@ -1,6 +1,5 @@
1
1
  """Models for the base data types."""
2
2
 
3
- import copy
4
3
  from enum import Enum
5
4
  from typing import Tuple
6
5
 
@@ -53,33 +52,53 @@ class BoundingBox(BaseModel):
53
52
  """height."""
54
53
  return abs(self.t - self.b)
55
54
 
56
- def scaled(self, scale: float) -> "BoundingBox":
57
- """scaled.
58
-
59
- :param scale: float:
60
-
61
- """
62
- out_bbox = copy.deepcopy(self)
63
- out_bbox.l *= scale
64
- out_bbox.r *= scale
65
- out_bbox.t *= scale
66
- out_bbox.b *= scale
67
-
68
- return out_bbox
69
-
70
- def normalized(self, page_size: Size) -> "BoundingBox":
71
- """normalized.
72
-
73
- :param page_size: Size:
74
-
75
- """
76
- out_bbox = copy.deepcopy(self)
77
- out_bbox.l /= page_size.width
78
- out_bbox.r /= page_size.width
79
- out_bbox.t /= page_size.height
80
- out_bbox.b /= page_size.height
81
-
82
- return out_bbox
55
+ def resize_by_scale(self, x_scale: float, y_scale: float):
56
+ """resize_by_scale."""
57
+ return BoundingBox(
58
+ l=self.l * x_scale,
59
+ r=self.r * x_scale,
60
+ t=self.t * y_scale,
61
+ b=self.b * y_scale,
62
+ coord_origin=self.coord_origin,
63
+ )
64
+
65
+ def scale_to_size(self, old_size: Size, new_size: Size):
66
+ """scale_to_size."""
67
+ return self.resize_by_scale(
68
+ x_scale=new_size.width / old_size.width,
69
+ y_scale=new_size.height / old_size.height,
70
+ )
71
+
72
+ # same as before, but using the implementation above
73
+ def scaled(self, scale: float):
74
+ """scaled."""
75
+ return self.resize_by_scale(x_scale=scale, y_scale=scale)
76
+
77
+ # same as before, but using the implementation above
78
+ def normalized(self, page_size: Size):
79
+ """normalized."""
80
+ return self.scale_to_size(
81
+ old_size=page_size, new_size=Size(height=1.0, width=1.0)
82
+ )
83
+
84
+ def expand_by_scale(self, x_scale: float, y_scale: float) -> "BoundingBox":
85
+ """expand_to_size."""
86
+ if self.coord_origin == CoordOrigin.TOPLEFT:
87
+ return BoundingBox(
88
+ l=self.l - self.width * x_scale,
89
+ r=self.r + self.width * x_scale,
90
+ t=self.t - self.height * y_scale,
91
+ b=self.b + self.height * y_scale,
92
+ coord_origin=self.coord_origin,
93
+ )
94
+ elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
95
+ return BoundingBox(
96
+ l=self.l - self.width * x_scale,
97
+ r=self.r + self.width * x_scale,
98
+ t=self.t + self.height * y_scale,
99
+ b=self.b - self.height * y_scale,
100
+ coord_origin=self.coord_origin,
101
+ )
83
102
 
84
103
  def as_tuple(self) -> Tuple[float, float, float, float]:
85
104
  """as_tuple."""
@@ -116,26 +135,27 @@ class BoundingBox(BaseModel):
116
135
 
117
136
  def area(self) -> float:
118
137
  """area."""
119
- area = (self.r - self.l) * (self.b - self.t)
120
- if self.coord_origin == CoordOrigin.BOTTOMLEFT:
121
- area = -area
122
- return area
138
+ return abs(self.r - self.l) * abs(self.b - self.t)
123
139
 
124
140
  def intersection_area_with(self, other: "BoundingBox") -> float:
125
- """intersection_area_with.
126
-
127
- :param other: "BoundingBox":
141
+ """Calculate the intersection area with another bounding box."""
142
+ if self.coord_origin != other.coord_origin:
143
+ raise ValueError("BoundingBoxes have different CoordOrigin")
128
144
 
129
- """
130
145
  # Calculate intersection coordinates
131
146
  left = max(self.l, other.l)
132
- top = max(self.t, other.t)
133
147
  right = min(self.r, other.r)
134
- bottom = min(self.b, other.b)
148
+
149
+ if self.coord_origin == CoordOrigin.TOPLEFT:
150
+ bottom = max(self.t, other.t)
151
+ top = min(self.b, other.b)
152
+ elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
153
+ top = min(self.t, other.t)
154
+ bottom = max(self.b, other.b)
135
155
 
136
156
  # Calculate intersection dimensions
137
157
  width = right - left
138
- height = bottom - top
158
+ height = top - bottom
139
159
 
140
160
  # If the bounding boxes do not overlap, width or height will be negative
141
161
  if width <= 0 or height <= 0:
@@ -143,6 +163,27 @@ class BoundingBox(BaseModel):
143
163
 
144
164
  return width * height
145
165
 
166
+ def intersection_over_union(
167
+ self, other: "BoundingBox", eps: float = 1.0e-6
168
+ ) -> float:
169
+ """intersection_over_union."""
170
+ intersection_area = self.intersection_area_with(other=other)
171
+
172
+ union_area = (
173
+ abs(self.l - self.r) * abs(self.t - self.b)
174
+ + abs(other.l - other.r) * abs(other.t - other.b)
175
+ - intersection_area
176
+ )
177
+
178
+ return intersection_area / (union_area + eps)
179
+
180
+ def intersection_over_self(
181
+ self, other: "BoundingBox", eps: float = 1.0e-6
182
+ ) -> float:
183
+ """intersection_over_self."""
184
+ intersection_area = self.intersection_area_with(other=other)
185
+ return intersection_area / self.area()
186
+
146
187
  def to_bottom_left_origin(self, page_height: float) -> "BoundingBox":
147
188
  """to_bottom_left_origin.
148
189
 
@@ -176,3 +217,151 @@ class BoundingBox(BaseModel):
176
217
  b=page_height - self.b, # self.t
177
218
  coord_origin=CoordOrigin.TOPLEFT,
178
219
  )
220
+
221
+ def overlaps(self, other: "BoundingBox") -> bool:
222
+ """overlaps."""
223
+ return self.overlaps_horizontally(other=other) and self.overlaps_vertically(
224
+ other=other
225
+ )
226
+
227
+ def overlaps_horizontally(self, other: "BoundingBox") -> bool:
228
+ """Check if two bounding boxes overlap horizontally."""
229
+ return not (self.r <= other.l or other.r <= self.l)
230
+
231
+ def overlaps_vertically(self, other: "BoundingBox") -> bool:
232
+ """Check if two bounding boxes overlap vertically."""
233
+ if self.coord_origin != other.coord_origin:
234
+ raise ValueError("BoundingBoxes have different CoordOrigin")
235
+
236
+ # Normalize coordinates if needed
237
+ if self.coord_origin == CoordOrigin.BOTTOMLEFT:
238
+ return not (self.t <= other.b or other.t <= self.b)
239
+ elif self.coord_origin == CoordOrigin.TOPLEFT:
240
+ return not (self.b <= other.t or other.b <= self.t)
241
+
242
+ def overlaps_vertically_with_iou(self, other: "BoundingBox", iou: float) -> bool:
243
+ """overlaps_y_with_iou."""
244
+ if (
245
+ self.coord_origin == CoordOrigin.BOTTOMLEFT
246
+ and other.coord_origin == CoordOrigin.BOTTOMLEFT
247
+ ):
248
+
249
+ if self.overlaps_vertically(other=other):
250
+
251
+ u0 = min(self.b, other.b)
252
+ u1 = max(self.t, other.t)
253
+
254
+ i0 = max(self.b, other.b)
255
+ i1 = min(self.t, other.t)
256
+
257
+ iou_ = float(i1 - i0) / float(u1 - u0)
258
+ return (iou_) > iou
259
+
260
+ return False
261
+
262
+ elif (
263
+ self.coord_origin == CoordOrigin.TOPLEFT
264
+ and other.coord_origin == CoordOrigin.TOPLEFT
265
+ ):
266
+ if self.overlaps_vertically(other=other):
267
+ u0 = min(self.t, other.t)
268
+ u1 = max(self.b, other.b)
269
+
270
+ i0 = max(self.t, other.t)
271
+ i1 = min(self.b, other.b)
272
+
273
+ iou_ = float(i1 - i0) / float(u1 - u0)
274
+ return (iou_) > iou
275
+
276
+ return False
277
+ else:
278
+ raise ValueError("BoundingBoxes have different CoordOrigin")
279
+
280
+ return False
281
+
282
+ def is_left_of(self, other: "BoundingBox") -> bool:
283
+ """is_left_of."""
284
+ return self.l < other.l
285
+
286
+ def is_strictly_left_of(self, other: "BoundingBox", eps: float = 0.001) -> bool:
287
+ """is_strictly_left_of."""
288
+ return (self.r + eps) < other.l
289
+
290
+ def is_above(self, other: "BoundingBox") -> bool:
291
+ """is_above."""
292
+ if (
293
+ self.coord_origin == CoordOrigin.BOTTOMLEFT
294
+ and other.coord_origin == CoordOrigin.BOTTOMLEFT
295
+ ):
296
+ return self.t > other.t
297
+
298
+ elif (
299
+ self.coord_origin == CoordOrigin.TOPLEFT
300
+ and other.coord_origin == CoordOrigin.TOPLEFT
301
+ ):
302
+ return self.t < other.t
303
+
304
+ else:
305
+ raise ValueError("BoundingBoxes have different CoordOrigin")
306
+
307
+ return False
308
+
309
+ def is_strictly_above(self, other: "BoundingBox", eps: float = 1.0e-3) -> bool:
310
+ """is_strictly_above."""
311
+ if (
312
+ self.coord_origin == CoordOrigin.BOTTOMLEFT
313
+ and other.coord_origin == CoordOrigin.BOTTOMLEFT
314
+ ):
315
+ return (self.b + eps) > other.t
316
+
317
+ elif (
318
+ self.coord_origin == CoordOrigin.TOPLEFT
319
+ and other.coord_origin == CoordOrigin.TOPLEFT
320
+ ):
321
+ return (self.b + eps) < other.t
322
+
323
+ else:
324
+ raise ValueError("BoundingBoxes have different CoordOrigin")
325
+
326
+ return False
327
+
328
+ def is_horizontally_connected(
329
+ self, elem_i: "BoundingBox", elem_j: "BoundingBox"
330
+ ) -> bool:
331
+ """is_horizontally_connected."""
332
+ if (
333
+ self.coord_origin == CoordOrigin.BOTTOMLEFT
334
+ and elem_i.coord_origin == CoordOrigin.BOTTOMLEFT
335
+ and elem_j.coord_origin == CoordOrigin.BOTTOMLEFT
336
+ ):
337
+ min_ij = min(elem_i.b, elem_j.b)
338
+ max_ij = max(elem_i.t, elem_j.t)
339
+
340
+ if self.b < max_ij and min_ij < self.t: # overlap_y
341
+ return False
342
+
343
+ if self.l < elem_i.r and elem_j.l < self.r:
344
+ return True
345
+
346
+ return False
347
+
348
+ elif (
349
+ self.coord_origin == CoordOrigin.TOPLEFT
350
+ and elem_i.coord_origin == CoordOrigin.TOPLEFT
351
+ and elem_j.coord_origin == CoordOrigin.TOPLEFT
352
+ ):
353
+ min_ij = min(elem_i.t, elem_j.t)
354
+ max_ij = max(elem_i.b, elem_j.b)
355
+
356
+ if self.t < max_ij and min_ij < self.b: # overlap_y
357
+ return False
358
+
359
+ if self.l < elem_i.r and elem_j.l < self.r:
360
+ return True
361
+
362
+ return False
363
+
364
+ else:
365
+ raise ValueError("BoundingBoxes have different CoordOrigin")
366
+
367
+ return False
@@ -585,7 +585,8 @@ class DocItem(
585
585
  crop_bbox = (
586
586
  self.prov[0]
587
587
  .bbox.to_top_left_origin(page_height=page.size.height)
588
- .scaled(scale=page_image.height / page.size.height)
588
+ .scale_to_size(old_size=page.size, new_size=page.image.size)
589
+ # .scaled(scale=page_image.height / page.size.height)
589
590
  )
590
591
  return page_image.crop(crop_bbox.as_tuple())
591
592
 
@@ -1515,6 +1516,9 @@ class DoclingDocument(BaseModel):
1515
1516
  elif label in [DocItemLabel.SECTION_HEADER]:
1516
1517
  return self.add_heading(text=text, orig=orig, prov=prov, parent=parent)
1517
1518
 
1519
+ elif label in [DocItemLabel.CODE]:
1520
+ return self.add_code(text=text, orig=orig, prov=prov, parent=parent)
1521
+
1518
1522
  else:
1519
1523
 
1520
1524
  if not parent:
@@ -1991,6 +1995,7 @@ class DoclingDocument(BaseModel):
1991
1995
  to_element: int = sys.maxsize,
1992
1996
  labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
1993
1997
  strict_text: bool = False,
1998
+ escaping_underscores: bool = True,
1994
1999
  image_placeholder: str = "<!-- image -->",
1995
2000
  image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
1996
2001
  indent: int = 4,
@@ -2013,6 +2018,7 @@ class DoclingDocument(BaseModel):
2013
2018
  to_element=to_element,
2014
2019
  labels=labels,
2015
2020
  strict_text=strict_text,
2021
+ escaping_underscores=escaping_underscores,
2016
2022
  image_placeholder=image_placeholder,
2017
2023
  image_mode=image_mode,
2018
2024
  indent=indent,
@@ -2030,6 +2036,7 @@ class DoclingDocument(BaseModel):
2030
2036
  to_element: int = sys.maxsize,
2031
2037
  labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
2032
2038
  strict_text: bool = False,
2039
+ escaping_underscores: bool = True,
2033
2040
  image_placeholder: str = "<!-- image -->",
2034
2041
  image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
2035
2042
  indent: int = 4,
@@ -2055,6 +2062,9 @@ class DoclingDocument(BaseModel):
2055
2062
  :param strict_text: bool: Whether to only include the text content
2056
2063
  of the document. (Default value = False).
2057
2064
  :type strict_text: bool = False
2065
+ :param escaping_underscores: bool: Whether to escape underscores in the
2066
+ text content of the document. (Default value = True).
2067
+ :type escaping_underscores: bool = True
2058
2068
  :param image_placeholder: The placeholder to include to position
2059
2069
  images in the markdown. (Default value = "\<!-- image --\>").
2060
2070
  :type image_placeholder: str = "<!-- image -->"
@@ -2157,6 +2167,10 @@ class DoclingDocument(BaseModel):
2157
2167
  text = f"{list_indent}{marker} {item.text}"
2158
2168
  mdtexts.append(text)
2159
2169
 
2170
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
2171
+ in_list = False
2172
+ mdtexts.append(f"$${item.text}$$")
2173
+
2160
2174
  elif isinstance(item, TextItem) and item.label in labels:
2161
2175
  in_list = False
2162
2176
  if len(item.text) and text_width > 0:
@@ -2205,10 +2219,14 @@ class DoclingDocument(BaseModel):
2205
2219
  """Escape underscores but leave them intact in the URL.."""
2206
2220
  # Firstly, identify all the URL patterns.
2207
2221
  url_pattern = r"!\[.*?\]\((.*?)\)"
2222
+ # Matches both inline ($...$) and block ($$...$$) LaTeX equations:
2223
+ latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
2224
+ combined_pattern = f"({url_pattern})|({latex_pattern})"
2225
+
2208
2226
  parts = []
2209
2227
  last_end = 0
2210
2228
 
2211
- for match in re.finditer(url_pattern, text):
2229
+ for match in re.finditer(combined_pattern, text):
2212
2230
  # Text to add before the URL (needs to be escaped)
2213
2231
  before_url = text[last_end : match.start()]
2214
2232
  parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
@@ -2223,7 +2241,8 @@ class DoclingDocument(BaseModel):
2223
2241
 
2224
2242
  return "".join(parts)
2225
2243
 
2226
- mdtext = escape_underscores(mdtext)
2244
+ if escaping_underscores:
2245
+ mdtext = escape_underscores(mdtext)
2227
2246
 
2228
2247
  return mdtext
2229
2248
 
@@ -2241,6 +2260,7 @@ class DoclingDocument(BaseModel):
2241
2260
  to_element,
2242
2261
  labels,
2243
2262
  strict_text=True,
2263
+ escaping_underscores=False,
2244
2264
  image_placeholder="",
2245
2265
  )
2246
2266
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.15.0
3
+ Version: 3.0.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -23,8 +23,8 @@ docling_core/transforms/chunker/hybrid_chunker.py,sha256=kokjDdxjc_gygOokQwYFVnH
23
23
  docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
24
24
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
25
25
  docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
26
- docling_core/types/doc/base.py,sha256=O5GH3psrV5dSWhNzZRpOtGEImk-XpqLJ7Et5VGm0tcU,4651
27
- docling_core/types/doc/document.py,sha256=VlB_scwBHTKUJD7q89PE1zH4L3Q5T-esQ1Sg9csnxbo,93847
26
+ docling_core/types/doc/base.py,sha256=lMRNq1DUK7K26L2VNZRqFaItCSZ6m9BdYTVaJA98PZQ,11495
27
+ docling_core/types/doc/document.py,sha256=1AGj1xQJ1XLP2nkgHIn8-BDNDic5OPll35cg051B6Ts,94839
28
28
  docling_core/types/doc/labels.py,sha256=8Luymal9SKXTwyqq1ONKiUTxuMo_nRMYfBkRPFkdSSo,5306
29
29
  docling_core/types/doc/tokens.py,sha256=GMtm5TsNljBPaMYkgmD3WWZmC0FHqKF9imKEEySz4ps,6020
30
30
  docling_core/types/doc/utils.py,sha256=YDOh_ZD1Y7OmCEDdCLJ_MO5K3HA67nc_acfhOK6WztU,1439
@@ -56,8 +56,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
56
56
  docling_core/utils/legacy.py,sha256=xfp7U0JqjI60K3loWiNTk8w08_KfCUzTb2MNULBOIz4,24396
57
57
  docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
58
58
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
59
- docling_core-2.15.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
60
- docling_core-2.15.0.dist-info/METADATA,sha256=FyObOk5pW4t7tc5FjK8AshHE1FfXIFx3ok2i2M3lyiw,5744
61
- docling_core-2.15.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
62
- docling_core-2.15.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
63
- docling_core-2.15.0.dist-info/RECORD,,
59
+ docling_core-3.0.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
60
+ docling_core-3.0.0.dist-info/METADATA,sha256=xwCWMsl3H5K_OCT_4Iz77zE4RVAW8JnKy19X02P-rAI,5743
61
+ docling_core-3.0.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
62
+ docling_core-3.0.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
63
+ docling_core-3.0.0.dist-info/RECORD,,