docling-core 1.7.2__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (36)
  1. docling_core/transforms/chunker/__init__.py +2 -8
  2. docling_core/transforms/chunker/base.py +27 -40
  3. docling_core/transforms/chunker/hierarchical_chunker.py +144 -312
  4. docling_core/types/__init__.py +3 -18
  5. docling_core/types/doc/__init__.py +25 -0
  6. docling_core/types/doc/base.py +136 -451
  7. docling_core/types/doc/document.py +1289 -559
  8. docling_core/types/{experimental → doc}/labels.py +4 -1
  9. docling_core/types/legacy_doc/__init__.py +6 -0
  10. docling_core/types/legacy_doc/base.py +485 -0
  11. docling_core/types/{doc → legacy_doc}/doc_ann.py +1 -1
  12. docling_core/types/{doc → legacy_doc}/doc_ocr.py +1 -1
  13. docling_core/types/{doc → legacy_doc}/doc_raw.py +1 -1
  14. docling_core/types/legacy_doc/document.py +715 -0
  15. docling_core/types/rec/subject.py +1 -1
  16. docling_core/utils/generate_docs.py +82 -0
  17. docling_core/utils/{ds_generate_jsonschema.py → generate_jsonschema.py} +4 -4
  18. docling_core/utils/validators.py +3 -3
  19. {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/METADATA +17 -17
  20. {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/RECORD +24 -31
  21. docling_core-2.0.1.dist-info/entry_points.txt +5 -0
  22. docling_core/transforms/id_generator/__init__.py +0 -12
  23. docling_core/transforms/id_generator/base.py +0 -30
  24. docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -27
  25. docling_core/transforms/id_generator/uuid_generator.py +0 -34
  26. docling_core/transforms/metadata_extractor/__init__.py +0 -13
  27. docling_core/transforms/metadata_extractor/base.py +0 -59
  28. docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +0 -59
  29. docling_core/types/experimental/__init__.py +0 -30
  30. docling_core/types/experimental/base.py +0 -167
  31. docling_core/types/experimental/document.py +0 -1192
  32. docling_core/utils/ds_generate_docs.py +0 -144
  33. docling_core-1.7.2.dist-info/entry_points.txt +0 -5
  34. docling_core/types/{doc → legacy_doc}/tokens.py +0 -0
  35. {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/LICENSE +0 -0
  36. {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/WHEEL +0 -0
@@ -34,7 +34,10 @@ class GroupLabel(str, Enum):
34
34
  """GroupLabel."""
35
35
 
36
36
  UNSPECIFIED = "unspecified"
37
- LIST = "list" # group label for list container (not the list-items)
37
+ LIST = (
38
+ "list" # group label for list container (not the list-items) (e.g. HTML <ul/>)
39
+ )
40
+ ORDERED_LIST = "ordered_list" # List with enumeration (e.g. HTML <ol/>)
38
41
  CHAPTER = "chapter"
39
42
  SECTION = "section"
40
43
  SHEET = "sheet"
@@ -0,0 +1,6 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Package for models defined by the Document type."""
@@ -0,0 +1,485 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define common models across CCS objects."""
7
+ from typing import Annotated, List, Literal, Optional, Union
8
+
9
+ import pandas as pd
10
+ from pydantic import BaseModel, Field, PositiveInt, StrictStr
11
+
12
+ from docling_core.search.mapping import es_field
13
+ from docling_core.types.legacy_doc.tokens import DocumentToken
14
+ from docling_core.utils.alias import AliasModel
15
+
16
# One raw cell record: four floats followed by two strings. The positions
# presumably line up with the names listed in CellHeader -- TODO confirm.
CellData = tuple[float, float, float, float, str, str]

# Fixed six-entry header tuple; each position only admits its literal name.
CellHeader = tuple[
    Literal["x0"], Literal["y0"], Literal["x1"], Literal["y1"],
    Literal["font"], Literal["text"],
]

# A list of floats constrained to exactly four entries.
BoundingBox = Annotated[list[float], Field(min_length=4, max_length=4)]

# A list of integers constrained to exactly two entries.
Span = Annotated[list[int], Field(min_length=2, max_length=2)]
30
+
31
+
32
class CellsContainer(BaseModel):
    """Container holding raw cell records together with their header."""

    # Cell records; defaults to None.
    data: Optional[list[CellData]] = None
    # Names of the six positions of a cell record.
    header: CellHeader = (
        "x0",
        "y0",
        "x1",
        "y1",
        "font",
        "text",
    )
37
+
38
+
39
class S3Resource(BaseModel):
    """A single resource kept in a cloud object storage."""

    # MIME type of the resource.
    mime: str
    # Path of the resource.
    path: str
    # Optional page number (strictly positive when given).
    page: Optional[PositiveInt] = None
45
+
46
+
47
class S3Data(AliasModel):
    """Group of optional resources kept in a cloud object storage.

    Each field is optional and defaults to None; most are exposed under a
    hyphenated alias (for example ``pdf-document``).
    """

    pdf_document: Optional[list[S3Resource]] = Field(
        default=None,
        alias="pdf-document",
    )
    pdf_pages: Optional[list[S3Resource]] = Field(
        default=None,
        alias="pdf-pages",
    )
    pdf_images: Optional[list[S3Resource]] = Field(
        default=None,
        alias="pdf-images",
    )
    json_document: Optional[S3Resource] = Field(
        default=None,
        alias="json-document",
    )
    json_meta: Optional[S3Resource] = Field(
        default=None,
        alias="json-meta",
    )
    glm_json_document: Optional[S3Resource] = Field(
        default=None,
        alias="glm-json-document",
    )
    # The only field without an alias.
    figures: Optional[list[S3Resource]] = None
59
+
60
+
61
class S3Reference(AliasModel):
    """Pointer to an s3 resource, stored as a strict string."""

    # Exposed under the alias ``__ref_s3_data``; see the example value for
    # the expected form of the reference.
    ref_s3_data: StrictStr = Field(
        alias="__ref_s3_data",
        examples=["#/_s3_data/figures/0"],
    )
67
+
68
+
69
class Prov(AliasModel):
    """Provenance record: bounding box, page and span of an item."""

    # Four-float bounding box.
    bbox: BoundingBox
    # Strictly positive page number.
    page: PositiveInt
    # Two-integer span.
    span: Span
    # Optional reference string, exposed under the alias ``__ref_s3_data``.
    ref_s3_data: Optional[StrictStr] = Field(
        default=None,
        alias="__ref_s3_data",
        json_schema_extra=es_field(suppress=True),
    )
78
+
79
+
80
class BoundingBoxContainer(BaseModel):
    """Pair of bounding boxes named ``min`` and ``max``."""

    # NOTE: the field names shadow the builtins inside the class body only;
    # they are part of the serialized schema and must stay as they are.
    min: BoundingBox
    max: BoundingBox
85
+
86
+
87
class BitmapObject(AliasModel):
    """Bitmap object described by a type, a bounding box and provenance."""

    # Exposed under the alias ``type``.
    obj_type: str = Field(alias="type")
    bounding_box: BoundingBoxContainer = Field(
        json_schema_extra=es_field(suppress=True),
    )
    prov: Prov
95
+
96
+
97
class PageDimensions(BaseModel):
    """Height and width recorded for one page."""

    height: float
    # Strictly positive page number the dimensions belong to.
    page: PositiveInt
    width: float
103
+
104
+
105
class TableCell(AliasModel):
    """One cell of a table: optional geometry/spans, text and a type."""

    # Optional four-float bounding box of the cell.
    bbox: Optional[BoundingBox] = None
    # Optional list of two-integer spans covered by the cell.
    spans: Optional[list[Span]] = None
    text: str = Field(
        json_schema_extra=es_field(term_vector="with_positions_offsets"),
    )
    # Exposed under the alias ``type``.
    obj_type: str = Field(alias="type")
112
+
113
+
114
class GlmTableCell(TableCell):
    """Table cell extended with row/column index, header flag and span.

    All extra fields are optional (or default to False) and are marked as
    suppressed in the search-mapping schema extras.
    """

    col: Optional[int] = Field(
        default=None,
        json_schema_extra=es_field(suppress=True),
    )
    col_header: bool = Field(
        default=False,
        alias="col-header",
        json_schema_extra=es_field(suppress=True),
    )
    col_span: Optional[Span] = Field(
        default=None,
        alias="col-span",
        json_schema_extra=es_field(suppress=True),
    )
    row: Optional[int] = Field(
        default=None,
        json_schema_extra=es_field(suppress=True),
    )
    row_header: bool = Field(
        default=False,
        alias="row-header",
        json_schema_extra=es_field(suppress=True),
    )
    row_span: Optional[Span] = Field(
        default=None,
        alias="row-span",
        json_schema_extra=es_field(suppress=True),
    )
131
+
132
+
133
class BaseCell(AliasModel):
    """Common base for items carrying provenance, text and a type."""

    # Optional list of provenance records.
    prov: Optional[list[Prov]] = None
    text: Optional[str] = Field(
        default=None,
        json_schema_extra=es_field(term_vector="with_positions_offsets"),
    )
    # Exposed under the alias ``type``.
    obj_type: str = Field(
        alias="type",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )

    def get_location_tokens(
        self,
        new_line: str,
        page_w: float,
        page_h: float,
        xsize: int = 100,
        ysize: int = 100,
        add_page_index: bool = True,
    ) -> str:
        """Build the location string of this item.

        One location token is produced per provenance record via
        ``DocumentToken.get_location`` and each is followed by ``new_line``.

        Args:
            new_line: Separator appended after every location token.
            page_w: Page width forwarded to ``DocumentToken.get_location``.
            page_h: Page height forwarded to ``DocumentToken.get_location``.
            xsize: Forwarded unchanged to ``DocumentToken.get_location``.
            ysize: Forwarded unchanged to ``DocumentToken.get_location``.
            add_page_index: When True the record's page is passed as
                ``page_i``; otherwise ``-1`` is passed.

        Returns:
            The concatenated tokens, or an empty string when ``prov`` is None.
        """
        if self.prov is None:
            return ""

        pieces = []
        for record in self.prov:
            token = DocumentToken.get_location(
                bbox=record.bbox,
                page_w=page_w,
                page_h=page_h,
                xsize=xsize,
                ysize=ysize,
                page_i=record.page if add_page_index else -1,
            )
            pieces.append(f"{token}{new_line}")

        return "".join(pieces)
175
+
176
+
177
class Table(BaseCell):
    """Table item with a grid of cells and exporters.

    The grid lives in ``data`` (rows of cells); ``num_rows`` / ``num_cols``
    carry the declared dimensions under the aliases ``#-rows`` / ``#-cols``.
    The inherited ``text`` is used as the caption when exporting to document
    tokens.
    """

    num_cols: int = Field(alias="#-cols")
    num_rows: int = Field(alias="#-rows")
    data: Optional[list[list[Union[GlmTableCell, TableCell]]]] = None
    model: Optional[str] = None

    # FIXME: we need to check why we have bounding_box (this should be in prov)
    bounding_box: Optional[BoundingBoxContainer] = Field(
        default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
    )

    def _get_tablecell_span(self, cell: TableCell, ix: int):
        """Summarize the spans of ``cell`` along one axis.

        Args:
            cell: The cell whose ``spans`` are inspected.
            ix: Position inside each span entry to read (``export_to_html``
                uses 0 for rows and 1 for columns).

        Returns:
            A tuple ``(count, first, last)`` with the number of distinct
            indices and their minimum and maximum. When the cell has no
            spans (None or empty) the result is ``(1, None, None)``.
        """
        if not cell.spans:
            return 1, None, None

        indices = {entry[ix] for entry in cell.spans}
        return len(indices), min(indices), max(indices)

    def export_to_dataframe(self) -> pd.DataFrame:
        """Export the table as a Pandas DataFrame.

        Leading rows that contain at least one cell typed ``"col_header"``
        are treated as header rows; their texts are joined per column with
        ``"."`` to form the column names. All remaining rows become data.

        Returns:
            The resulting DataFrame; an empty one when there is no data or a
            declared dimension is zero.

        Raises:
            RuntimeError: If an empty row is met while scanning header rows.
        """
        if self.data is None or self.num_rows == 0 or self.num_cols == 0:
            return pd.DataFrame()

        # Count how many leading rows are column headers
        num_headers = 0
        for row in self.data:
            if len(row) == 0:
                raise RuntimeError(f"Invalid table. {len(row)=} but {self.num_cols=}.")

            if not any(cell.obj_type == "col_header" for cell in row):
                break
            num_headers += 1

        # Create the column names from all col_headers
        columns: Optional[list[str]] = None
        if num_headers > 0:
            columns = ["" for _ in range(self.num_cols)]
            for header_row in self.data[:num_headers]:
                for j, cell in enumerate(header_row):
                    col_name = cell.text
                    # Stacked header levels are separated by a dot.
                    if columns[j] != "":
                        col_name = f".{col_name}"
                    columns[j] += col_name

        # Create table data from the non-header rows
        table_data = [[cell.text for cell in row] for row in self.data[num_headers:]]

        return pd.DataFrame(table_data, columns=columns)

    def export_to_html(self) -> str:
        """Export the table as html.

        Cells typed ``col_header`` / ``col_multi_header`` are rendered as
        ``<th>``, everything else as ``<td>``. A cell whose spans start at a
        different row or column than the current grid position is skipped,
        so a spanning cell is emitted once with ``rowspan`` / ``colspan``.
        Cell text is HTML-escaped so that characters such as ``<`` or ``&``
        cannot break or inject markup.

        Returns:
            The ``<table>...</table>`` markup, or an empty string when the
            table has no data.
        """
        import html  # local import: keeps this block self-contained

        if self.data is None:
            return ""

        rows: list[str] = []
        for i in range(self.num_rows):
            cells: list[str] = []
            for j in range(self.num_cols):
                cell: TableCell = self.data[i][j]

                rowspan, rowstart, _ = self._get_tablecell_span(cell, 0)
                colspan, colstart, _ = self._get_tablecell_span(cell, 1)

                # Only emit a spanning cell at the position where it starts.
                if rowstart is not None and rowstart != i:
                    continue
                if colstart is not None and colstart != j:
                    continue

                content = html.escape(cell.text.strip())
                if cell.obj_type in ("col_header", "col_multi_header"):
                    celltag = "th"
                else:
                    celltag = "td"

                opening_tag = celltag
                if rowspan > 1:
                    opening_tag += f' rowspan="{rowspan}"'
                if colspan > 1:
                    opening_tag += f' colspan="{colspan}"'

                cells.append(f"<{opening_tag}>{content}</{celltag}>")
            rows.append(f"<tr>{''.join(cells)}</tr>")

        return f"<table>{''.join(rows)}</table>"

    def export_to_document_tokens(
        self,
        new_line: str = "\n",
        page_w: float = 0.0,
        page_h: float = 0.0,
        xsize: int = 100,
        ysize: int = 100,
        add_location: bool = True,
        add_caption: bool = True,
        add_content: bool = True,
        add_cell_location: bool = True,
        add_cell_label: bool = True,
        add_cell_text: bool = True,
        add_page_index: bool = True,
    ) -> str:
        """Export table to document tokens format.

        Args:
            new_line: Separator written after each structural token/row.
            page_w: Page width forwarded to ``DocumentToken.get_location``.
            page_h: Page height forwarded to ``DocumentToken.get_location``.
            xsize: Forwarded unchanged to ``DocumentToken.get_location``.
            ysize: Forwarded unchanged to ``DocumentToken.get_location``.
            add_location: Emit the table's own location tokens.
            add_caption: Emit ``text`` (stripped) as caption when non-empty.
            add_content: Emit the rows and cells.
            add_cell_location: Emit a location token for cells with a bbox.
            add_cell_label: Emit ``<obj_type>`` for each cell.
            add_cell_text: Emit the stripped cell text.
            add_page_index: Use real page numbers in location tokens; when
                False ``-1`` is passed instead.

        Returns:
            The serialized table.
        """
        parts = [f"{DocumentToken.BEG_TABLE.value}{new_line}"]

        if add_location:
            parts.append(
                self.get_location_tokens(
                    new_line=new_line,
                    page_w=page_w,
                    page_h=page_h,
                    xsize=xsize,
                    ysize=ysize,
                    add_page_index=add_page_index,
                )
            )

        if add_caption and self.text:
            parts.append(
                f"{DocumentToken.BEG_CAPTION.value}"
                f"{self.text.strip()}"
                f"{DocumentToken.END_CAPTION.value}"
                f"{new_line}"
            )

        if add_content and self.data:
            # Page index used for every cell location. It does not depend on
            # the cell, so it is resolved once. None means "no cell location
            # can be emitted": a page index was requested but the table has
            # no provenance to take it from.
            cell_page_i: Optional[int]
            if not add_page_index:
                cell_page_i = -1
            elif self.prov:
                cell_page_i = self.prov[0].page
            else:
                cell_page_i = None

            for i, row in enumerate(self.data):
                parts.append(f"<row_{i}>")
                for j, col in enumerate(row):
                    text = col.text.strip() if add_cell_text else ""

                    cell_loc = ""
                    if (
                        col.bbox is not None
                        and add_cell_location
                        and cell_page_i is not None
                    ):
                        cell_loc = DocumentToken.get_location(
                            bbox=col.bbox,
                            page_w=page_w,
                            page_h=page_h,
                            xsize=xsize,
                            ysize=ysize,
                            page_i=cell_page_i,
                        )

                    cell_label = ""
                    if add_cell_label and col.obj_type:
                        cell_label = f"<{col.obj_type}>"

                    parts.append(f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>")

                parts.append(f"</row_{i}>{new_line}")

        parts.append(f"{DocumentToken.END_TABLE.value}{new_line}")

        return "".join(parts)
374
+
375
+
376
+ # FIXME: let's add some figure specific data-types later
377
class Figure(BaseCell):
    """Figure item; the inherited ``text`` serves as caption on export."""

    # FIXME: we need to check why we have bounding_box (this should be in prov)
    bounding_box: Optional[BoundingBoxContainer] = Field(
        default=None,
        alias="bounding-box",
        json_schema_extra=es_field(suppress=True),
    )

    def export_to_document_tokens(
        self,
        new_line: str = "\n",
        page_w: float = 0.0,
        page_h: float = 0.0,
        xsize: int = 100,
        ysize: int = 100,
        add_location: bool = True,
        add_caption: bool = True,
        add_content: bool = True,  # not used at the moment
        add_page_index: bool = True,
    ):
        """Serialize the figure in document tokens format.

        The output starts with the figure-begin token, optionally followed
        by the location tokens and the caption, and ends with the figure-end
        token. ``add_content`` is accepted but currently has no effect.
        """
        chunks = [f"{DocumentToken.BEG_FIGURE.value}{new_line}"]

        if add_location:
            chunks.append(
                self.get_location_tokens(
                    new_line=new_line,
                    page_w=page_w,
                    page_h=page_h,
                    xsize=xsize,
                    ysize=ysize,
                    add_page_index=add_page_index,
                )
            )

        # The caption is only written for a non-empty text.
        if add_caption and self.text:
            chunks.append(
                f"{DocumentToken.BEG_CAPTION.value}"
                f"{self.text.strip()}"
                f"{DocumentToken.END_CAPTION.value}"
                f"{new_line}"
            )

        chunks.append(f"{DocumentToken.END_FIGURE.value}{new_line}")

        return "".join(chunks)
419
+
420
+
421
class BaseText(BaseCell):
    """Base model shared by text items."""

    # FIXME: do we need these ???
    name: Optional[StrictStr] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    font: Optional[str] = None

    def export_to_document_tokens(
        self,
        new_line: str = "\n",
        page_w: float = 0.0,
        page_h: float = 0.0,
        xsize: int = 100,
        ysize: int = 100,
        add_location: bool = True,
        add_content: bool = True,
        add_page_index: bool = True,
    ):
        """Serialize the text item in document tokens format.

        The item is wrapped in ``<obj_type>...</obj_type>`` followed by
        ``new_line``. Location tokens (without separators) and the stripped
        text are placed inside the wrapper when requested.

        Raises:
            AssertionError: If ``<obj_type>`` is not a known document token
                (only checked when assertions are enabled).
        """
        opening_tag = f"<{self.obj_type}>"

        assert DocumentToken.is_known_token(
            opening_tag
        ), f"failed DocumentToken.is_known_token({opening_tag})"

        location = ""
        if add_location:
            location = self.get_location_tokens(
                new_line="",
                page_w=page_w,
                page_h=page_h,
                xsize=xsize,
                ysize=ysize,
                add_page_index=add_page_index,
            )

        content = ""
        if add_content and self.text is not None:
            content = self.text.strip()

        return f"{opening_tag}{location}{content}</{self.obj_type}>{new_line}"
464
+
465
+
466
class ListItem(BaseText):
    """Text item that additionally carries a required identifier."""

    # Required string; what it identifies is not visible from this file.
    identifier: str
470
+
471
+
472
class Ref(AliasModel):
    """Named, typed reference."""

    name: str
    # Exposed under the alias ``type``.
    obj_type: str = Field(alias="type")
    # Exposed under the alias ``$ref``.
    ref: str = Field(alias="$ref")
478
+
479
+
480
class PageReference(BaseModel):
    """Reference to a page by hash, model and page number."""

    # NOTE: the field name shadows the builtin inside the class body only;
    # it is part of the serialized schema and must stay as it is.
    hash: str = Field(
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    model: str = Field(
        json_schema_extra=es_field(suppress=True),
    )
    # Strictly positive page number.
    page: PositiveInt = Field(
        json_schema_extra=es_field(type="short"),
    )
@@ -8,7 +8,7 @@ from typing import Any
8
8
 
9
9
  from pydantic import BaseModel
10
10
 
11
- from docling_core.types.doc.base import BoundingBox
11
+ from docling_core.types.legacy_doc.base import BoundingBox
12
12
 
13
13
  AnnotationReport = Any # TODO
14
14
 
@@ -8,7 +8,7 @@ from typing import Any, Dict, List, Literal
8
8
 
9
9
  from pydantic import BaseModel, Field
10
10
 
11
- from docling_core.types.doc.base import BoundingBox
11
+ from docling_core.types.legacy_doc.base import BoundingBox
12
12
  from docling_core.utils.alias import AliasModel
13
13
 
14
14
  CoordsOrder = Literal["x1", "y1", "x2", "y2"]
@@ -9,7 +9,7 @@ from typing import Any, List, Optional
9
9
  from pydantic import BaseModel, Field
10
10
  from typing_extensions import Annotated
11
11
 
12
- from docling_core.types.doc.base import BoundingBox
12
+ from docling_core.types.legacy_doc.base import BoundingBox
13
13
  from docling_core.utils.alias import AliasModel
14
14
 
15
15
  FontDifferences = dict[str, Any]