docling-core 1.7.2__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (36)
  1. docling_core/transforms/chunker/__init__.py +2 -8
  2. docling_core/transforms/chunker/base.py +27 -40
  3. docling_core/transforms/chunker/hierarchical_chunker.py +144 -312
  4. docling_core/types/__init__.py +3 -18
  5. docling_core/types/doc/__init__.py +25 -0
  6. docling_core/types/doc/base.py +136 -451
  7. docling_core/types/doc/document.py +1289 -559
  8. docling_core/types/{experimental → doc}/labels.py +4 -1
  9. docling_core/types/legacy_doc/__init__.py +6 -0
  10. docling_core/types/legacy_doc/base.py +485 -0
  11. docling_core/types/{doc → legacy_doc}/doc_ann.py +1 -1
  12. docling_core/types/{doc → legacy_doc}/doc_ocr.py +1 -1
  13. docling_core/types/{doc → legacy_doc}/doc_raw.py +1 -1
  14. docling_core/types/legacy_doc/document.py +715 -0
  15. docling_core/types/rec/subject.py +1 -1
  16. docling_core/utils/generate_docs.py +82 -0
  17. docling_core/utils/{ds_generate_jsonschema.py → generate_jsonschema.py} +4 -4
  18. docling_core/utils/validators.py +3 -3
  19. {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/METADATA +17 -17
  20. {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/RECORD +24 -31
  21. docling_core-2.0.1.dist-info/entry_points.txt +5 -0
  22. docling_core/transforms/id_generator/__init__.py +0 -12
  23. docling_core/transforms/id_generator/base.py +0 -30
  24. docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -27
  25. docling_core/transforms/id_generator/uuid_generator.py +0 -34
  26. docling_core/transforms/metadata_extractor/__init__.py +0 -13
  27. docling_core/transforms/metadata_extractor/base.py +0 -59
  28. docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +0 -59
  29. docling_core/types/experimental/__init__.py +0 -30
  30. docling_core/types/experimental/base.py +0 -167
  31. docling_core/types/experimental/document.py +0 -1192
  32. docling_core/utils/ds_generate_docs.py +0 -144
  33. docling_core-1.7.2.dist-info/entry_points.txt +0 -5
  34. docling_core/types/{doc → legacy_doc}/tokens.py +0 -0
  35. {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/LICENSE +0 -0
  36. {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/WHEEL +0 -0
@@ -1,485 +1,170 @@
1
- #
2
- # Copyright IBM Corp. 2024 - 2024
3
- # SPDX-License-Identifier: MIT
4
- #
1
+ """Models for the base data types."""
5
2
 
6
- """Define common models across CCS objects."""
7
- from typing import Annotated, List, Literal, Optional, Union
3
+ import copy
4
+ from enum import Enum
5
+ from typing import Tuple
8
6
 
9
- import pandas as pd
10
- from pydantic import BaseModel, Field, PositiveInt, StrictStr
7
+ from pydantic import BaseModel
11
8
 
12
- from docling_core.search.mapping import es_field
13
- from docling_core.types.doc.tokens import DocumentToken
14
- from docling_core.utils.alias import AliasModel
15
9
 
16
- CellData = tuple[float, float, float, float, str, str]
10
+ class CoordOrigin(str, Enum):
11
+ """CoordOrigin."""
17
12
 
18
- CellHeader = tuple[
19
- Literal["x0"],
20
- Literal["y0"],
21
- Literal["x1"],
22
- Literal["y1"],
23
- Literal["font"],
24
- Literal["text"],
25
- ]
13
+ TOPLEFT = "TOPLEFT"
14
+ BOTTOMLEFT = "BOTTOMLEFT"
26
15
 
27
- BoundingBox = Annotated[list[float], Field(min_length=4, max_length=4)]
28
16
 
29
- Span = Annotated[list[int], Field(min_length=2, max_length=2)]
17
+ class Size(BaseModel):
18
+ """Size."""
30
19
 
20
+ width: float = 0.0
21
+ height: float = 0.0
31
22
 
32
- class CellsContainer(BaseModel):
33
- """Cell container."""
23
+ def as_tuple(self):
24
+ """as_tuple."""
25
+ return (self.width, self.height)
34
26
 
35
- data: Optional[list[CellData]] = None
36
- header: CellHeader = ("x0", "y0", "x1", "y1", "font", "text")
37
27
 
28
+ class BoundingBox(BaseModel):
29
+ """BoundingBox."""
38
30
 
39
- class S3Resource(BaseModel):
40
- """Resource in a cloud object storage."""
31
+ l: float # left
32
+ t: float # top
33
+ r: float # right
34
+ b: float # bottom
41
35
 
42
- mime: str
43
- path: str
44
- page: Optional[PositiveInt] = None
36
+ coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
45
37
 
38
+ @property
39
+ def width(self):
40
+ """width."""
41
+ return self.r - self.l
46
42
 
47
- class S3Data(AliasModel):
48
- """Data object in a cloud object storage."""
43
+ @property
44
+ def height(self):
45
+ """height."""
46
+ return abs(self.t - self.b)
49
47
 
50
- pdf_document: Optional[list[S3Resource]] = Field(default=None, alias="pdf-document")
51
- pdf_pages: Optional[list[S3Resource]] = Field(default=None, alias="pdf-pages")
52
- pdf_images: Optional[list[S3Resource]] = Field(default=None, alias="pdf-images")
53
- json_document: Optional[S3Resource] = Field(default=None, alias="json-document")
54
- json_meta: Optional[S3Resource] = Field(default=None, alias="json-meta")
55
- glm_json_document: Optional[S3Resource] = Field(
56
- default=None, alias="glm-json-document"
57
- )
58
- figures: Optional[list[S3Resource]] = None
48
+ def scaled(self, scale: float) -> "BoundingBox":
49
+ """scaled.
59
50
 
51
+ :param scale: float:
60
52
 
61
- class S3Reference(AliasModel):
62
- """References an s3 resource."""
53
+ """
54
+ out_bbox = copy.deepcopy(self)
55
+ out_bbox.l *= scale
56
+ out_bbox.r *= scale
57
+ out_bbox.t *= scale
58
+ out_bbox.b *= scale
63
59
 
64
- ref_s3_data: StrictStr = Field(
65
- alias="__ref_s3_data", examples=["#/_s3_data/figures/0"]
66
- )
60
+ return out_bbox
67
61
 
62
+ def normalized(self, page_size: Size) -> "BoundingBox":
63
+ """normalized.
68
64
 
69
- class Prov(AliasModel):
70
- """Provenance."""
65
+ :param page_size: Size:
71
66
 
72
- bbox: BoundingBox
73
- page: PositiveInt
74
- span: Span
75
- ref_s3_data: Optional[StrictStr] = Field(
76
- default=None, alias="__ref_s3_data", json_schema_extra=es_field(suppress=True)
77
- )
67
+ """
68
+ out_bbox = copy.deepcopy(self)
69
+ out_bbox.l /= page_size.width
70
+ out_bbox.r /= page_size.width
71
+ out_bbox.t /= page_size.height
72
+ out_bbox.b /= page_size.height
78
73
 
74
+ return out_bbox
79
75
 
80
- class BoundingBoxContainer(BaseModel):
81
- """Bounding box container."""
76
+ def as_tuple(self):
77
+ """as_tuple."""
78
+ if self.coord_origin == CoordOrigin.TOPLEFT:
79
+ return (self.l, self.t, self.r, self.b)
80
+ elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
81
+ return (self.l, self.b, self.r, self.t)
82
82
 
83
- min: BoundingBox
84
- max: BoundingBox
83
+ @classmethod
84
+ def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
85
+ """from_tuple.
85
86
 
87
+ :param coord: Tuple[float:
88
+ :param ...]:
89
+ :param origin: CoordOrigin:
86
90
 
87
- class BitmapObject(AliasModel):
88
- """Bitmap object."""
91
+ """
92
+ if origin == CoordOrigin.TOPLEFT:
93
+ l, t, r, b = coord[0], coord[1], coord[2], coord[3]
94
+ if r < l:
95
+ l, r = r, l
96
+ if b < t:
97
+ b, t = t, b
89
98
 
90
- obj_type: str = Field(alias="type")
91
- bounding_box: BoundingBoxContainer = Field(
92
- json_schema_extra=es_field(suppress=True)
93
- )
94
- prov: Prov
99
+ return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
100
+ elif origin == CoordOrigin.BOTTOMLEFT:
101
+ l, b, r, t = coord[0], coord[1], coord[2], coord[3]
102
+ if r < l:
103
+ l, r = r, l
104
+ if b > t:
105
+ b, t = t, b
95
106
 
107
+ return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
96
108
 
97
- class PageDimensions(BaseModel):
98
- """Page dimensions."""
109
+ def area(self) -> float:
110
+ """area."""
111
+ area = (self.r - self.l) * (self.b - self.t)
112
+ if self.coord_origin == CoordOrigin.BOTTOMLEFT:
113
+ area = -area
114
+ return area
99
115
 
100
- height: float
101
- page: PositiveInt
102
- width: float
103
-
104
-
105
- class TableCell(AliasModel):
106
- """Table cell."""
107
-
108
- bbox: Optional[BoundingBox] = None
109
- spans: Optional[list[Span]] = None
110
- text: str = Field(json_schema_extra=es_field(term_vector="with_positions_offsets"))
111
- obj_type: str = Field(alias="type")
112
-
113
-
114
- class GlmTableCell(TableCell):
115
- """Glm Table cell."""
116
-
117
- col: Optional[int] = Field(default=None, json_schema_extra=es_field(suppress=True))
118
- col_header: bool = Field(
119
- default=False, alias="col-header", json_schema_extra=es_field(suppress=True)
120
- )
121
- col_span: Optional[Span] = Field(
122
- default=None, alias="col-span", json_schema_extra=es_field(suppress=True)
123
- )
124
- row: Optional[int] = Field(default=None, json_schema_extra=es_field(suppress=True))
125
- row_header: bool = Field(
126
- default=False, alias="row-header", json_schema_extra=es_field(suppress=True)
127
- )
128
- row_span: Optional[Span] = Field(
129
- default=None, alias="row-span", json_schema_extra=es_field(suppress=True)
130
- )
131
-
132
-
133
- class BaseCell(AliasModel):
134
- """Base cell."""
135
-
136
- prov: Optional[list[Prov]] = None
137
- text: Optional[str] = Field(
138
- default=None, json_schema_extra=es_field(term_vector="with_positions_offsets")
139
- )
140
- obj_type: str = Field(
141
- alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
142
- )
143
-
144
- def get_location_tokens(
145
- self,
146
- new_line: str,
147
- page_w: float,
148
- page_h: float,
149
- xsize: int = 100,
150
- ysize: int = 100,
151
- add_page_index: bool = True,
152
- ) -> str:
153
- """Get the location string for the BaseCell."""
154
- if self.prov is None:
155
- return ""
156
-
157
- location = ""
158
- for prov in self.prov:
159
-
160
- page_i = -1
161
- if add_page_index:
162
- page_i = prov.page
163
-
164
- loc_str = DocumentToken.get_location(
165
- bbox=prov.bbox,
166
- page_w=page_w,
167
- page_h=page_h,
168
- xsize=xsize,
169
- ysize=ysize,
170
- page_i=page_i,
171
- )
172
- location += f"{loc_str}{new_line}"
173
-
174
- return location
175
-
176
-
177
- class Table(BaseCell):
178
- """Table."""
179
-
180
- num_cols: int = Field(alias="#-cols")
181
- num_rows: int = Field(alias="#-rows")
182
- data: Optional[list[list[Union[GlmTableCell, TableCell]]]] = None
183
- model: Optional[str] = None
184
-
185
- # FIXME: we need to check why we have bounding_box (this should be in prov)
186
- bounding_box: Optional[BoundingBoxContainer] = Field(
187
- default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
188
- )
189
-
190
- def _get_tablecell_span(self, cell: TableCell, ix: int):
191
- if cell.spans is None:
192
- span = set()
193
- else:
194
- span = set([s[ix] for s in cell.spans])
195
- if len(span) == 0:
196
- return 1, None, None
197
- return len(span), min(span), max(span)
198
-
199
- def export_to_dataframe(self) -> pd.DataFrame:
200
- """Export the table as a Pandas DataFrame."""
201
- if self.data is None or self.num_rows == 0 or self.num_cols == 0:
202
- return pd.DataFrame()
203
-
204
- # Count how many rows are column headers
205
- num_headers = 0
206
- for i, row in enumerate(self.data):
207
- if len(row) == 0:
208
- raise RuntimeError(f"Invalid table. {len(row)=} but {self.num_cols=}.")
209
-
210
- any_header = False
211
- for cell in row:
212
- if cell.obj_type == "col_header":
213
- any_header = True
214
- break
215
-
216
- if any_header:
217
- num_headers += 1
218
- else:
219
- break
220
-
221
- # Create the column names from all col_headers
222
- columns: Optional[List[str]] = None
223
- if num_headers > 0:
224
- columns = ["" for _ in range(self.num_cols)]
225
- for i in range(num_headers):
226
- for j, cell in enumerate(self.data[i]):
227
- col_name = cell.text
228
- if columns[j] != "":
229
- col_name = f".{col_name}"
230
- columns[j] += col_name
231
-
232
- # Create table data
233
- table_data = [[cell.text for cell in row] for row in self.data[num_headers:]]
234
-
235
- # Create DataFrame
236
- df = pd.DataFrame(table_data, columns=columns)
237
-
238
- return df
239
-
240
- def export_to_html(self) -> str:
241
- """Export the table as html."""
242
- body = ""
243
- nrows = self.num_rows
244
- ncols = self.num_cols
245
-
246
- if self.data is None:
247
- return ""
248
- for i in range(nrows):
249
- body += "<tr>"
250
- for j in range(ncols):
251
- cell: TableCell = self.data[i][j]
252
-
253
- rowspan, rowstart, rowend = self._get_tablecell_span(cell, 0)
254
- colspan, colstart, colend = self._get_tablecell_span(cell, 1)
255
-
256
- if rowstart is not None and rowstart != i:
257
- continue
258
- if colstart is not None and colstart != j:
259
- continue
260
-
261
- if rowstart is None:
262
- rowstart = i
263
- if colstart is None:
264
- colstart = j
265
-
266
- content = cell.text.strip()
267
- label = cell.obj_type
268
- celltag = "td"
269
- if label in ["row_header", "row_multi_header", "row_title"]:
270
- pass
271
- elif label in ["col_header", "col_multi_header"]:
272
- celltag = "th"
273
-
274
- opening_tag = f"{celltag}"
275
- if rowspan > 1:
276
- opening_tag += f' rowspan="{rowspan}"'
277
- if colspan > 1:
278
- opening_tag += f' colspan="{colspan}"'
279
-
280
- body += f"<{opening_tag}>{content}</{celltag}>"
281
- body += "</tr>"
282
- body = f"<table>{body}</table>"
283
-
284
- return body
285
-
286
- def export_to_document_tokens(
287
- self,
288
- new_line: str = "\n",
289
- page_w: float = 0.0,
290
- page_h: float = 0.0,
291
- xsize: int = 100,
292
- ysize: int = 100,
293
- add_location: bool = True,
294
- add_caption: bool = True,
295
- add_content: bool = True,
296
- add_cell_location: bool = True,
297
- add_cell_label: bool = True,
298
- add_cell_text: bool = True,
299
- add_page_index: bool = True,
300
- ):
301
- """Export table to document tokens format."""
302
- body = f"{DocumentToken.BEG_TABLE.value}{new_line}"
303
-
304
- if add_location:
305
- body += self.get_location_tokens(
306
- new_line=new_line,
307
- page_w=page_w,
308
- page_h=page_h,
309
- xsize=xsize,
310
- ysize=ysize,
311
- add_page_index=add_page_index,
312
- )
313
-
314
- if add_caption and self.text is not None and len(self.text) > 0:
315
- body += f"{DocumentToken.BEG_CAPTION.value}"
316
- body += f"{self.text.strip()}"
317
- body += f"{DocumentToken.END_CAPTION.value}"
318
- body += f"{new_line}"
319
-
320
- if add_content and self.data is not None and len(self.data) > 0:
321
- for i, row in enumerate(self.data):
322
- body += f"<row_{i}>"
323
- for j, col in enumerate(row):
324
-
325
- text = ""
326
- if add_cell_text:
327
- text = col.text.strip()
328
-
329
- cell_loc = ""
330
- if (
331
- col.bbox is not None
332
- and add_cell_location
333
- and add_page_index
334
- and self.prov is not None
335
- and len(self.prov) > 0
336
- ):
337
- cell_loc = DocumentToken.get_location(
338
- bbox=col.bbox,
339
- page_w=page_w,
340
- page_h=page_h,
341
- xsize=xsize,
342
- ysize=ysize,
343
- page_i=self.prov[0].page,
344
- )
345
- elif (
346
- col.bbox is not None
347
- and add_cell_location
348
- and not add_page_index
349
- ):
350
- cell_loc = DocumentToken.get_location(
351
- bbox=col.bbox,
352
- page_w=page_w,
353
- page_h=page_h,
354
- xsize=xsize,
355
- ysize=ysize,
356
- page_i=-1,
357
- )
358
-
359
- cell_label = ""
360
- if (
361
- add_cell_label
362
- and col.obj_type is not None
363
- and len(col.obj_type) > 0
364
- ):
365
- cell_label = f"<{col.obj_type}>"
366
-
367
- body += f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>"
368
-
369
- body += f"</row_{i}>{new_line}"
370
-
371
- body += f"{DocumentToken.END_TABLE.value}{new_line}"
372
-
373
- return body
374
-
375
-
376
- # FIXME: let's add some figure specific data-types later
377
- class Figure(BaseCell):
378
- """Figure."""
379
-
380
- # FIXME: we need to check why we have bounding_box (this should be in prov)
381
- bounding_box: Optional[BoundingBoxContainer] = Field(
382
- default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
383
- )
384
-
385
- def export_to_document_tokens(
386
- self,
387
- new_line: str = "\n",
388
- page_w: float = 0.0,
389
- page_h: float = 0.0,
390
- xsize: int = 100,
391
- ysize: int = 100,
392
- add_location: bool = True,
393
- add_caption: bool = True,
394
- add_content: bool = True, # not used at the moment
395
- add_page_index: bool = True,
396
- ):
397
- """Export figure to document tokens format."""
398
- body = f"{DocumentToken.BEG_FIGURE.value}{new_line}"
399
-
400
- if add_location:
401
- body += self.get_location_tokens(
402
- new_line=new_line,
403
- page_w=page_w,
404
- page_h=page_h,
405
- xsize=xsize,
406
- ysize=ysize,
407
- add_page_index=add_page_index,
116
+ def intersection_area_with(self, other: "BoundingBox") -> float:
117
+ """intersection_area_with.
118
+
119
+ :param other: "BoundingBox":
120
+
121
+ """
122
+ # Calculate intersection coordinates
123
+ left = max(self.l, other.l)
124
+ top = max(self.t, other.t)
125
+ right = min(self.r, other.r)
126
+ bottom = min(self.b, other.b)
127
+
128
+ # Calculate intersection dimensions
129
+ width = right - left
130
+ height = bottom - top
131
+
132
+ # If the bounding boxes do not overlap, width or height will be negative
133
+ if width <= 0 or height <= 0:
134
+ return 0.0
135
+
136
+ return width * height
137
+
138
+ def to_bottom_left_origin(self, page_height) -> "BoundingBox":
139
+ """to_bottom_left_origin.
140
+
141
+ :param page_height:
142
+
143
+ """
144
+ if self.coord_origin == CoordOrigin.BOTTOMLEFT:
145
+ return self
146
+ elif self.coord_origin == CoordOrigin.TOPLEFT:
147
+ return BoundingBox(
148
+ l=self.l,
149
+ r=self.r,
150
+ t=page_height - self.t,
151
+ b=page_height - self.b,
152
+ coord_origin=CoordOrigin.BOTTOMLEFT,
408
153
  )
409
154
 
410
- if add_caption and self.text is not None and len(self.text) > 0:
411
- body += f"{DocumentToken.BEG_CAPTION.value}"
412
- body += f"{self.text.strip()}"
413
- body += f"{DocumentToken.END_CAPTION.value}"
414
- body += f"{new_line}"
415
-
416
- body += f"{DocumentToken.END_FIGURE.value}{new_line}"
417
-
418
- return body
419
-
420
-
421
- class BaseText(BaseCell):
422
- """Base model for text objects."""
423
-
424
- # FIXME: do we need these ???
425
- name: Optional[StrictStr] = Field(
426
- default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
427
- )
428
- font: Optional[str] = None
429
-
430
- def export_to_document_tokens(
431
- self,
432
- new_line: str = "\n",
433
- page_w: float = 0.0,
434
- page_h: float = 0.0,
435
- xsize: int = 100,
436
- ysize: int = 100,
437
- add_location: bool = True,
438
- add_content: bool = True,
439
- add_page_index: bool = True,
440
- ):
441
- """Export text element to document tokens format."""
442
- body = f"<{self.obj_type}>"
443
-
444
- assert DocumentToken.is_known_token(
445
- body
446
- ), f"failed DocumentToken.is_known_token({body})"
447
-
448
- if add_location:
449
- body += self.get_location_tokens(
450
- new_line="",
451
- page_w=page_w,
452
- page_h=page_h,
453
- xsize=xsize,
454
- ysize=ysize,
455
- add_page_index=add_page_index,
155
+ def to_top_left_origin(self, page_height):
156
+ """to_top_left_origin.
157
+
158
+ :param page_height:
159
+
160
+ """
161
+ if self.coord_origin == CoordOrigin.TOPLEFT:
162
+ return self
163
+ elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
164
+ return BoundingBox(
165
+ l=self.l,
166
+ r=self.r,
167
+ t=page_height - self.t, # self.b
168
+ b=page_height - self.b, # self.t
169
+ coord_origin=CoordOrigin.TOPLEFT,
456
170
  )
457
-
458
- if add_content and self.text is not None:
459
- body += self.text.strip()
460
-
461
- body += f"</{self.obj_type}>{new_line}"
462
-
463
- return body
464
-
465
-
466
- class ListItem(BaseText):
467
- """List item."""
468
-
469
- identifier: str
470
-
471
-
472
- class Ref(AliasModel):
473
- """Reference."""
474
-
475
- name: str
476
- obj_type: str = Field(alias="type")
477
- ref: str = Field(alias="$ref")
478
-
479
-
480
- class PageReference(BaseModel):
481
- """Page reference."""
482
-
483
- hash: str = Field(json_schema_extra=es_field(type="keyword", ignore_above=8191))
484
- model: str = Field(json_schema_extra=es_field(suppress=True))
485
- page: PositiveInt = Field(json_schema_extra=es_field(type="short"))