docling-core 2.15.1__tar.gz → 3.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (63)
  1. {docling_core-2.15.1 → docling_core-3.0.0}/PKG-INFO +1 -1
  2. docling_core-3.0.0/docling_core/types/doc/base.py +367 -0
  3. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/doc/document.py +20 -3
  4. {docling_core-2.15.1 → docling_core-3.0.0}/pyproject.toml +1 -1
  5. docling_core-2.15.1/docling_core/types/doc/base.py +0 -178
  6. {docling_core-2.15.1 → docling_core-3.0.0}/LICENSE +0 -0
  7. {docling_core-2.15.1 → docling_core-3.0.0}/README.md +0 -0
  8. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/__init__.py +0 -0
  9. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/cli/__init__.py +0 -0
  10. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/cli/view.py +0 -0
  11. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/py.typed +0 -0
  12. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  13. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  14. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  15. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  16. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  17. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  18. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  19. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  20. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/search/__init__.py +0 -0
  21. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  22. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/search/mapping.py +0 -0
  23. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/search/meta.py +0 -0
  24. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/search/package.py +0 -0
  25. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/transforms/__init__.py +0 -0
  26. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/transforms/chunker/__init__.py +0 -0
  27. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/transforms/chunker/base.py +0 -0
  28. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  29. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  30. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/__init__.py +0 -0
  31. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/base.py +0 -0
  32. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/doc/__init__.py +0 -0
  33. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/doc/labels.py +0 -0
  34. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/doc/tokens.py +0 -0
  35. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/doc/utils.py +0 -0
  36. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/gen/__init__.py +0 -0
  37. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/gen/generic.py +0 -0
  38. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/io/__init__.py +0 -0
  39. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  40. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/legacy_doc/base.py +0 -0
  41. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  42. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  43. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  44. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/legacy_doc/document.py +0 -0
  45. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  46. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/nlp/__init__.py +0 -0
  47. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/nlp/qa.py +0 -0
  48. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/nlp/qa_labels.py +0 -0
  49. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/rec/__init__.py +0 -0
  50. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/rec/attribute.py +0 -0
  51. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/rec/base.py +0 -0
  52. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/rec/predicate.py +0 -0
  53. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/rec/record.py +0 -0
  54. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/rec/statement.py +0 -0
  55. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/types/rec/subject.py +0 -0
  56. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/utils/__init__.py +0 -0
  57. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/utils/alias.py +0 -0
  58. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/utils/file.py +0 -0
  59. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/utils/generate_docs.py +0 -0
  60. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/utils/generate_jsonschema.py +0 -0
  61. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/utils/legacy.py +0 -0
  62. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/utils/validate.py +0 -0
  63. {docling_core-2.15.1 → docling_core-3.0.0}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.15.1
3
+ Version: 3.0.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -0,0 +1,367 @@
1
+ """Models for the base data types."""
2
+
3
+ from enum import Enum
4
+ from typing import Tuple
5
+
6
+ from pydantic import BaseModel
7
+
8
+
9
+ class ImageRefMode(str, Enum):
10
+ """ImageRefMode."""
11
+
12
+ PLACEHOLDER = "placeholder" # just a place-holder
13
+ EMBEDDED = "embedded" # embed the image as a base64
14
+ REFERENCED = "referenced" # reference the image via uri
15
+
16
+
17
+ class CoordOrigin(str, Enum):
18
+ """CoordOrigin."""
19
+
20
+ TOPLEFT = "TOPLEFT"
21
+ BOTTOMLEFT = "BOTTOMLEFT"
22
+
23
+
24
+ class Size(BaseModel):
25
+ """Size."""
26
+
27
+ width: float = 0.0
28
+ height: float = 0.0
29
+
30
+ def as_tuple(self):
31
+ """as_tuple."""
32
+ return (self.width, self.height)
33
+
34
+
35
+ class BoundingBox(BaseModel):
36
+ """BoundingBox."""
37
+
38
+ l: float # left
39
+ t: float # top
40
+ r: float # right
41
+ b: float # bottom
42
+
43
+ coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
44
+
45
+ @property
46
+ def width(self):
47
+ """width."""
48
+ return self.r - self.l
49
+
50
+ @property
51
+ def height(self):
52
+ """height."""
53
+ return abs(self.t - self.b)
54
+
55
+ def resize_by_scale(self, x_scale: float, y_scale: float):
56
+ """resize_by_scale."""
57
+ return BoundingBox(
58
+ l=self.l * x_scale,
59
+ r=self.r * x_scale,
60
+ t=self.t * y_scale,
61
+ b=self.b * y_scale,
62
+ coord_origin=self.coord_origin,
63
+ )
64
+
65
+ def scale_to_size(self, old_size: Size, new_size: Size):
66
+ """scale_to_size."""
67
+ return self.resize_by_scale(
68
+ x_scale=new_size.width / old_size.width,
69
+ y_scale=new_size.height / old_size.height,
70
+ )
71
+
72
+ # same as before, but using the implementation above
73
+ def scaled(self, scale: float):
74
+ """scaled."""
75
+ return self.resize_by_scale(x_scale=scale, y_scale=scale)
76
+
77
+ # same as before, but using the implementation above
78
+ def normalized(self, page_size: Size):
79
+ """normalized."""
80
+ return self.scale_to_size(
81
+ old_size=page_size, new_size=Size(height=1.0, width=1.0)
82
+ )
83
+
84
+ def expand_by_scale(self, x_scale: float, y_scale: float) -> "BoundingBox":
85
+ """expand_to_size."""
86
+ if self.coord_origin == CoordOrigin.TOPLEFT:
87
+ return BoundingBox(
88
+ l=self.l - self.width * x_scale,
89
+ r=self.r + self.width * x_scale,
90
+ t=self.t - self.height * y_scale,
91
+ b=self.b + self.height * y_scale,
92
+ coord_origin=self.coord_origin,
93
+ )
94
+ elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
95
+ return BoundingBox(
96
+ l=self.l - self.width * x_scale,
97
+ r=self.r + self.width * x_scale,
98
+ t=self.t + self.height * y_scale,
99
+ b=self.b - self.height * y_scale,
100
+ coord_origin=self.coord_origin,
101
+ )
102
+
103
+ def as_tuple(self) -> Tuple[float, float, float, float]:
104
+ """as_tuple."""
105
+ if self.coord_origin == CoordOrigin.TOPLEFT:
106
+ return (self.l, self.t, self.r, self.b)
107
+ elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
108
+ return (self.l, self.b, self.r, self.t)
109
+
110
+ @classmethod
111
+ def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
112
+ """from_tuple.
113
+
114
+ :param coord: Tuple[float:
115
+ :param ...]:
116
+ :param origin: CoordOrigin:
117
+
118
+ """
119
+ if origin == CoordOrigin.TOPLEFT:
120
+ l, t, r, b = coord[0], coord[1], coord[2], coord[3]
121
+ if r < l:
122
+ l, r = r, l
123
+ if b < t:
124
+ b, t = t, b
125
+
126
+ return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
127
+ elif origin == CoordOrigin.BOTTOMLEFT:
128
+ l, b, r, t = coord[0], coord[1], coord[2], coord[3]
129
+ if r < l:
130
+ l, r = r, l
131
+ if b > t:
132
+ b, t = t, b
133
+
134
+ return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
135
+
136
+ def area(self) -> float:
137
+ """area."""
138
+ return abs(self.r - self.l) * abs(self.b - self.t)
139
+
140
+ def intersection_area_with(self, other: "BoundingBox") -> float:
141
+ """Calculate the intersection area with another bounding box."""
142
+ if self.coord_origin != other.coord_origin:
143
+ raise ValueError("BoundingBoxes have different CoordOrigin")
144
+
145
+ # Calculate intersection coordinates
146
+ left = max(self.l, other.l)
147
+ right = min(self.r, other.r)
148
+
149
+ if self.coord_origin == CoordOrigin.TOPLEFT:
150
+ bottom = max(self.t, other.t)
151
+ top = min(self.b, other.b)
152
+ elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
153
+ top = min(self.t, other.t)
154
+ bottom = max(self.b, other.b)
155
+
156
+ # Calculate intersection dimensions
157
+ width = right - left
158
+ height = top - bottom
159
+
160
+ # If the bounding boxes do not overlap, width or height will be negative
161
+ if width <= 0 or height <= 0:
162
+ return 0.0
163
+
164
+ return width * height
165
+
166
+ def intersection_over_union(
167
+ self, other: "BoundingBox", eps: float = 1.0e-6
168
+ ) -> float:
169
+ """intersection_over_union."""
170
+ intersection_area = self.intersection_area_with(other=other)
171
+
172
+ union_area = (
173
+ abs(self.l - self.r) * abs(self.t - self.b)
174
+ + abs(other.l - other.r) * abs(other.t - other.b)
175
+ - intersection_area
176
+ )
177
+
178
+ return intersection_area / (union_area + eps)
179
+
180
+ def intersection_over_self(
181
+ self, other: "BoundingBox", eps: float = 1.0e-6
182
+ ) -> float:
183
+ """intersection_over_self."""
184
+ intersection_area = self.intersection_area_with(other=other)
185
+ return intersection_area / self.area()
186
+
187
+ def to_bottom_left_origin(self, page_height: float) -> "BoundingBox":
188
+ """to_bottom_left_origin.
189
+
190
+ :param page_height:
191
+
192
+ """
193
+ if self.coord_origin == CoordOrigin.BOTTOMLEFT:
194
+ return self.model_copy()
195
+ elif self.coord_origin == CoordOrigin.TOPLEFT:
196
+ return BoundingBox(
197
+ l=self.l,
198
+ r=self.r,
199
+ t=page_height - self.t,
200
+ b=page_height - self.b,
201
+ coord_origin=CoordOrigin.BOTTOMLEFT,
202
+ )
203
+
204
+ def to_top_left_origin(self, page_height: float) -> "BoundingBox":
205
+ """to_top_left_origin.
206
+
207
+ :param page_height:
208
+
209
+ """
210
+ if self.coord_origin == CoordOrigin.TOPLEFT:
211
+ return self.model_copy()
212
+ elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
213
+ return BoundingBox(
214
+ l=self.l,
215
+ r=self.r,
216
+ t=page_height - self.t, # self.b
217
+ b=page_height - self.b, # self.t
218
+ coord_origin=CoordOrigin.TOPLEFT,
219
+ )
220
+
221
+ def overlaps(self, other: "BoundingBox") -> bool:
222
+ """overlaps."""
223
+ return self.overlaps_horizontally(other=other) and self.overlaps_vertically(
224
+ other=other
225
+ )
226
+
227
+ def overlaps_horizontally(self, other: "BoundingBox") -> bool:
228
+ """Check if two bounding boxes overlap horizontally."""
229
+ return not (self.r <= other.l or other.r <= self.l)
230
+
231
+ def overlaps_vertically(self, other: "BoundingBox") -> bool:
232
+ """Check if two bounding boxes overlap vertically."""
233
+ if self.coord_origin != other.coord_origin:
234
+ raise ValueError("BoundingBoxes have different CoordOrigin")
235
+
236
+ # Normalize coordinates if needed
237
+ if self.coord_origin == CoordOrigin.BOTTOMLEFT:
238
+ return not (self.t <= other.b or other.t <= self.b)
239
+ elif self.coord_origin == CoordOrigin.TOPLEFT:
240
+ return not (self.b <= other.t or other.b <= self.t)
241
+
242
+ def overlaps_vertically_with_iou(self, other: "BoundingBox", iou: float) -> bool:
243
+ """overlaps_y_with_iou."""
244
+ if (
245
+ self.coord_origin == CoordOrigin.BOTTOMLEFT
246
+ and other.coord_origin == CoordOrigin.BOTTOMLEFT
247
+ ):
248
+
249
+ if self.overlaps_vertically(other=other):
250
+
251
+ u0 = min(self.b, other.b)
252
+ u1 = max(self.t, other.t)
253
+
254
+ i0 = max(self.b, other.b)
255
+ i1 = min(self.t, other.t)
256
+
257
+ iou_ = float(i1 - i0) / float(u1 - u0)
258
+ return (iou_) > iou
259
+
260
+ return False
261
+
262
+ elif (
263
+ self.coord_origin == CoordOrigin.TOPLEFT
264
+ and other.coord_origin == CoordOrigin.TOPLEFT
265
+ ):
266
+ if self.overlaps_vertically(other=other):
267
+ u0 = min(self.t, other.t)
268
+ u1 = max(self.b, other.b)
269
+
270
+ i0 = max(self.t, other.t)
271
+ i1 = min(self.b, other.b)
272
+
273
+ iou_ = float(i1 - i0) / float(u1 - u0)
274
+ return (iou_) > iou
275
+
276
+ return False
277
+ else:
278
+ raise ValueError("BoundingBoxes have different CoordOrigin")
279
+
280
+ return False
281
+
282
+ def is_left_of(self, other: "BoundingBox") -> bool:
283
+ """is_left_of."""
284
+ return self.l < other.l
285
+
286
+ def is_strictly_left_of(self, other: "BoundingBox", eps: float = 0.001) -> bool:
287
+ """is_strictly_left_of."""
288
+ return (self.r + eps) < other.l
289
+
290
+ def is_above(self, other: "BoundingBox") -> bool:
291
+ """is_above."""
292
+ if (
293
+ self.coord_origin == CoordOrigin.BOTTOMLEFT
294
+ and other.coord_origin == CoordOrigin.BOTTOMLEFT
295
+ ):
296
+ return self.t > other.t
297
+
298
+ elif (
299
+ self.coord_origin == CoordOrigin.TOPLEFT
300
+ and other.coord_origin == CoordOrigin.TOPLEFT
301
+ ):
302
+ return self.t < other.t
303
+
304
+ else:
305
+ raise ValueError("BoundingBoxes have different CoordOrigin")
306
+
307
+ return False
308
+
309
+ def is_strictly_above(self, other: "BoundingBox", eps: float = 1.0e-3) -> bool:
310
+ """is_strictly_above."""
311
+ if (
312
+ self.coord_origin == CoordOrigin.BOTTOMLEFT
313
+ and other.coord_origin == CoordOrigin.BOTTOMLEFT
314
+ ):
315
+ return (self.b + eps) > other.t
316
+
317
+ elif (
318
+ self.coord_origin == CoordOrigin.TOPLEFT
319
+ and other.coord_origin == CoordOrigin.TOPLEFT
320
+ ):
321
+ return (self.b + eps) < other.t
322
+
323
+ else:
324
+ raise ValueError("BoundingBoxes have different CoordOrigin")
325
+
326
+ return False
327
+
328
+ def is_horizontally_connected(
329
+ self, elem_i: "BoundingBox", elem_j: "BoundingBox"
330
+ ) -> bool:
331
+ """is_horizontally_connected."""
332
+ if (
333
+ self.coord_origin == CoordOrigin.BOTTOMLEFT
334
+ and elem_i.coord_origin == CoordOrigin.BOTTOMLEFT
335
+ and elem_j.coord_origin == CoordOrigin.BOTTOMLEFT
336
+ ):
337
+ min_ij = min(elem_i.b, elem_j.b)
338
+ max_ij = max(elem_i.t, elem_j.t)
339
+
340
+ if self.b < max_ij and min_ij < self.t: # overlap_y
341
+ return False
342
+
343
+ if self.l < elem_i.r and elem_j.l < self.r:
344
+ return True
345
+
346
+ return False
347
+
348
+ elif (
349
+ self.coord_origin == CoordOrigin.TOPLEFT
350
+ and elem_i.coord_origin == CoordOrigin.TOPLEFT
351
+ and elem_j.coord_origin == CoordOrigin.TOPLEFT
352
+ ):
353
+ min_ij = min(elem_i.t, elem_j.t)
354
+ max_ij = max(elem_i.b, elem_j.b)
355
+
356
+ if self.t < max_ij and min_ij < self.b: # overlap_y
357
+ return False
358
+
359
+ if self.l < elem_i.r and elem_j.l < self.r:
360
+ return True
361
+
362
+ return False
363
+
364
+ else:
365
+ raise ValueError("BoundingBoxes have different CoordOrigin")
366
+
367
+ return False
@@ -585,7 +585,8 @@ class DocItem(
585
585
  crop_bbox = (
586
586
  self.prov[0]
587
587
  .bbox.to_top_left_origin(page_height=page.size.height)
588
- .scaled(scale=page_image.height / page.size.height)
588
+ .scale_to_size(old_size=page.size, new_size=page.image.size)
589
+ # .scaled(scale=page_image.height / page.size.height)
589
590
  )
590
591
  return page_image.crop(crop_bbox.as_tuple())
591
592
 
@@ -1994,6 +1995,7 @@ class DoclingDocument(BaseModel):
1994
1995
  to_element: int = sys.maxsize,
1995
1996
  labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
1996
1997
  strict_text: bool = False,
1998
+ escaping_underscores: bool = True,
1997
1999
  image_placeholder: str = "<!-- image -->",
1998
2000
  image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
1999
2001
  indent: int = 4,
@@ -2016,6 +2018,7 @@ class DoclingDocument(BaseModel):
2016
2018
  to_element=to_element,
2017
2019
  labels=labels,
2018
2020
  strict_text=strict_text,
2021
+ escaping_underscores=escaping_underscores,
2019
2022
  image_placeholder=image_placeholder,
2020
2023
  image_mode=image_mode,
2021
2024
  indent=indent,
@@ -2033,6 +2036,7 @@ class DoclingDocument(BaseModel):
2033
2036
  to_element: int = sys.maxsize,
2034
2037
  labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
2035
2038
  strict_text: bool = False,
2039
+ escaping_underscores: bool = True,
2036
2040
  image_placeholder: str = "<!-- image -->",
2037
2041
  image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
2038
2042
  indent: int = 4,
@@ -2058,6 +2062,9 @@ class DoclingDocument(BaseModel):
2058
2062
  :param strict_text: bool: Whether to only include the text content
2059
2063
  of the document. (Default value = False).
2060
2064
  :type strict_text: bool = False
2065
+ :param escaping_underscores: bool: Whether to escape underscores in the
2066
+ text content of the document. (Default value = True).
2067
+ :type escaping_underscores: bool = True
2061
2068
  :param image_placeholder: The placeholder to include to position
2062
2069
  images in the markdown. (Default value = "\<!-- image --\>").
2063
2070
  :type image_placeholder: str = "<!-- image -->"
@@ -2160,6 +2167,10 @@ class DoclingDocument(BaseModel):
2160
2167
  text = f"{list_indent}{marker} {item.text}"
2161
2168
  mdtexts.append(text)
2162
2169
 
2170
+ elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
2171
+ in_list = False
2172
+ mdtexts.append(f"$${item.text}$$")
2173
+
2163
2174
  elif isinstance(item, TextItem) and item.label in labels:
2164
2175
  in_list = False
2165
2176
  if len(item.text) and text_width > 0:
@@ -2208,10 +2219,14 @@ class DoclingDocument(BaseModel):
2208
2219
  """Escape underscores but leave them intact in the URL.."""
2209
2220
  # Firstly, identify all the URL patterns.
2210
2221
  url_pattern = r"!\[.*?\]\((.*?)\)"
2222
+ # Matches both inline ($...$) and block ($$...$$) LaTeX equations:
2223
+ latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
2224
+ combined_pattern = f"({url_pattern})|({latex_pattern})"
2225
+
2211
2226
  parts = []
2212
2227
  last_end = 0
2213
2228
 
2214
- for match in re.finditer(url_pattern, text):
2229
+ for match in re.finditer(combined_pattern, text):
2215
2230
  # Text to add before the URL (needs to be escaped)
2216
2231
  before_url = text[last_end : match.start()]
2217
2232
  parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
@@ -2226,7 +2241,8 @@ class DoclingDocument(BaseModel):
2226
2241
 
2227
2242
  return "".join(parts)
2228
2243
 
2229
- mdtext = escape_underscores(mdtext)
2244
+ if escaping_underscores:
2245
+ mdtext = escape_underscores(mdtext)
2230
2246
 
2231
2247
  return mdtext
2232
2248
 
@@ -2244,6 +2260,7 @@ class DoclingDocument(BaseModel):
2244
2260
  to_element,
2245
2261
  labels,
2246
2262
  strict_text=True,
2263
+ escaping_underscores=False,
2247
2264
  image_placeholder="",
2248
2265
  )
2249
2266
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "2.15.1"
3
+ version = "3.0.0"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
@@ -1,178 +0,0 @@
1
- """Models for the base data types."""
2
-
3
- import copy
4
- from enum import Enum
5
- from typing import Tuple
6
-
7
- from pydantic import BaseModel
8
-
9
-
10
- class ImageRefMode(str, Enum):
11
- """ImageRefMode."""
12
-
13
- PLACEHOLDER = "placeholder" # just a place-holder
14
- EMBEDDED = "embedded" # embed the image as a base64
15
- REFERENCED = "referenced" # reference the image via uri
16
-
17
-
18
- class CoordOrigin(str, Enum):
19
- """CoordOrigin."""
20
-
21
- TOPLEFT = "TOPLEFT"
22
- BOTTOMLEFT = "BOTTOMLEFT"
23
-
24
-
25
- class Size(BaseModel):
26
- """Size."""
27
-
28
- width: float = 0.0
29
- height: float = 0.0
30
-
31
- def as_tuple(self):
32
- """as_tuple."""
33
- return (self.width, self.height)
34
-
35
-
36
- class BoundingBox(BaseModel):
37
- """BoundingBox."""
38
-
39
- l: float # left
40
- t: float # top
41
- r: float # right
42
- b: float # bottom
43
-
44
- coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
45
-
46
- @property
47
- def width(self):
48
- """width."""
49
- return self.r - self.l
50
-
51
- @property
52
- def height(self):
53
- """height."""
54
- return abs(self.t - self.b)
55
-
56
- def scaled(self, scale: float) -> "BoundingBox":
57
- """scaled.
58
-
59
- :param scale: float:
60
-
61
- """
62
- out_bbox = copy.deepcopy(self)
63
- out_bbox.l *= scale
64
- out_bbox.r *= scale
65
- out_bbox.t *= scale
66
- out_bbox.b *= scale
67
-
68
- return out_bbox
69
-
70
- def normalized(self, page_size: Size) -> "BoundingBox":
71
- """normalized.
72
-
73
- :param page_size: Size:
74
-
75
- """
76
- out_bbox = copy.deepcopy(self)
77
- out_bbox.l /= page_size.width
78
- out_bbox.r /= page_size.width
79
- out_bbox.t /= page_size.height
80
- out_bbox.b /= page_size.height
81
-
82
- return out_bbox
83
-
84
- def as_tuple(self) -> Tuple[float, float, float, float]:
85
- """as_tuple."""
86
- if self.coord_origin == CoordOrigin.TOPLEFT:
87
- return (self.l, self.t, self.r, self.b)
88
- elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
89
- return (self.l, self.b, self.r, self.t)
90
-
91
- @classmethod
92
- def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
93
- """from_tuple.
94
-
95
- :param coord: Tuple[float:
96
- :param ...]:
97
- :param origin: CoordOrigin:
98
-
99
- """
100
- if origin == CoordOrigin.TOPLEFT:
101
- l, t, r, b = coord[0], coord[1], coord[2], coord[3]
102
- if r < l:
103
- l, r = r, l
104
- if b < t:
105
- b, t = t, b
106
-
107
- return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
108
- elif origin == CoordOrigin.BOTTOMLEFT:
109
- l, b, r, t = coord[0], coord[1], coord[2], coord[3]
110
- if r < l:
111
- l, r = r, l
112
- if b > t:
113
- b, t = t, b
114
-
115
- return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
116
-
117
- def area(self) -> float:
118
- """area."""
119
- area = (self.r - self.l) * (self.b - self.t)
120
- if self.coord_origin == CoordOrigin.BOTTOMLEFT:
121
- area = -area
122
- return area
123
-
124
- def intersection_area_with(self, other: "BoundingBox") -> float:
125
- """intersection_area_with.
126
-
127
- :param other: "BoundingBox":
128
-
129
- """
130
- # Calculate intersection coordinates
131
- left = max(self.l, other.l)
132
- top = max(self.t, other.t)
133
- right = min(self.r, other.r)
134
- bottom = min(self.b, other.b)
135
-
136
- # Calculate intersection dimensions
137
- width = right - left
138
- height = bottom - top
139
-
140
- # If the bounding boxes do not overlap, width or height will be negative
141
- if width <= 0 or height <= 0:
142
- return 0.0
143
-
144
- return width * height
145
-
146
- def to_bottom_left_origin(self, page_height: float) -> "BoundingBox":
147
- """to_bottom_left_origin.
148
-
149
- :param page_height:
150
-
151
- """
152
- if self.coord_origin == CoordOrigin.BOTTOMLEFT:
153
- return self.model_copy()
154
- elif self.coord_origin == CoordOrigin.TOPLEFT:
155
- return BoundingBox(
156
- l=self.l,
157
- r=self.r,
158
- t=page_height - self.t,
159
- b=page_height - self.b,
160
- coord_origin=CoordOrigin.BOTTOMLEFT,
161
- )
162
-
163
- def to_top_left_origin(self, page_height: float) -> "BoundingBox":
164
- """to_top_left_origin.
165
-
166
- :param page_height:
167
-
168
- """
169
- if self.coord_origin == CoordOrigin.TOPLEFT:
170
- return self.model_copy()
171
- elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
172
- return BoundingBox(
173
- l=self.l,
174
- r=self.r,
175
- t=page_height - self.t, # self.b
176
- b=page_height - self.b, # self.t
177
- coord_origin=CoordOrigin.TOPLEFT,
178
- )
File without changes
File without changes