docling-core 2.45.0__py3-none-any.whl → 2.47.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/serializer/common.py +1 -0
- docling_core/transforms/serializer/doctags.py +2 -0
- docling_core/transforms/serializer/html.py +16 -10
- docling_core/transforms/serializer/markdown.py +8 -1
- docling_core/types/doc/__init__.py +2 -0
- docling_core/types/doc/document.py +145 -34
- docling_core/utils/legacy.py +1 -1
- {docling_core-2.45.0.dist-info → docling_core-2.47.0.dist-info}/METADATA +1 -1
- {docling_core-2.45.0.dist-info → docling_core-2.47.0.dist-info}/RECORD +13 -13
- {docling_core-2.45.0.dist-info → docling_core-2.47.0.dist-info}/WHEEL +0 -0
- {docling_core-2.45.0.dist-info → docling_core-2.47.0.dist-info}/entry_points.txt +0 -0
- {docling_core-2.45.0.dist-info → docling_core-2.47.0.dist-info}/licenses/LICENSE +0 -0
- {docling_core-2.45.0.dist-info → docling_core-2.47.0.dist-info}/top_level.txt +0 -0
|
@@ -157,6 +157,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
|
|
|
157
157
|
item: TableItem,
|
|
158
158
|
doc_serializer: BaseDocSerializer,
|
|
159
159
|
doc: DoclingDocument,
|
|
160
|
+
visited: Optional[set[str]] = None,
|
|
160
161
|
**kwargs: Any,
|
|
161
162
|
) -> SerializationResult:
|
|
162
163
|
"""Serializes the passed item."""
|
|
@@ -179,6 +180,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
|
|
|
179
180
|
add_cell_text=params.add_table_cell_text,
|
|
180
181
|
xsize=params.xsize,
|
|
181
182
|
ysize=params.ysize,
|
|
183
|
+
visited=visited,
|
|
182
184
|
)
|
|
183
185
|
res_parts.append(create_ser_result(text=otsl_text, span_source=item))
|
|
184
186
|
|
|
@@ -65,8 +65,8 @@ from docling_core.types.doc.document import (
|
|
|
65
65
|
PictureItem,
|
|
66
66
|
PictureMoleculeData,
|
|
67
67
|
PictureTabularChartData,
|
|
68
|
+
RichTableCell,
|
|
68
69
|
SectionHeaderItem,
|
|
69
|
-
TableCell,
|
|
70
70
|
TableItem,
|
|
71
71
|
TextItem,
|
|
72
72
|
TitleItem,
|
|
@@ -346,9 +346,6 @@ class HTMLTableSerializer(BaseTableSerializer):
|
|
|
346
346
|
**kwargs: Any,
|
|
347
347
|
) -> SerializationResult:
|
|
348
348
|
"""Serializes the passed table item to HTML."""
|
|
349
|
-
nrows = item.data.num_rows
|
|
350
|
-
ncols = item.data.num_cols
|
|
351
|
-
|
|
352
349
|
res_parts: list[SerializationResult] = []
|
|
353
350
|
cap_res = doc_serializer.serialize_captions(item=item, tag="caption", **kwargs)
|
|
354
351
|
if cap_res.text:
|
|
@@ -356,11 +353,11 @@ class HTMLTableSerializer(BaseTableSerializer):
|
|
|
356
353
|
|
|
357
354
|
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
358
355
|
body = ""
|
|
356
|
+
span_source: Union[DocItem, list[SerializationResult]] = []
|
|
359
357
|
|
|
360
|
-
for i in
|
|
358
|
+
for i, row in enumerate(item.data.grid):
|
|
361
359
|
body += "<tr>"
|
|
362
|
-
for j in
|
|
363
|
-
cell: TableCell = item.data.grid[i][j]
|
|
360
|
+
for j, cell in enumerate(row):
|
|
364
361
|
|
|
365
362
|
rowspan, rowstart = (
|
|
366
363
|
cell.row_span,
|
|
@@ -376,7 +373,16 @@ class HTMLTableSerializer(BaseTableSerializer):
|
|
|
376
373
|
if colstart != j:
|
|
377
374
|
continue
|
|
378
375
|
|
|
379
|
-
|
|
376
|
+
if isinstance(cell, RichTableCell):
|
|
377
|
+
ser_res = doc_serializer.serialize(
|
|
378
|
+
item=cell.ref.resolve(doc=doc), **kwargs
|
|
379
|
+
)
|
|
380
|
+
content = ser_res.text
|
|
381
|
+
span_source = [ser_res]
|
|
382
|
+
else:
|
|
383
|
+
content = html.escape(cell.text.strip())
|
|
384
|
+
span_source = item
|
|
385
|
+
|
|
380
386
|
celltag = "td"
|
|
381
387
|
if cell.column_header or cell.row_header or cell.row_section:
|
|
382
388
|
celltag = "th"
|
|
@@ -389,14 +395,14 @@ class HTMLTableSerializer(BaseTableSerializer):
|
|
|
389
395
|
|
|
390
396
|
text_dir = get_text_direction(content)
|
|
391
397
|
if text_dir == "rtl":
|
|
392
|
-
opening_tag += f' dir="{
|
|
398
|
+
opening_tag += f' dir="{text_dir}"'
|
|
393
399
|
|
|
394
400
|
body += f"<{opening_tag}>{content}</{celltag}>"
|
|
395
401
|
body += "</tr>"
|
|
396
402
|
|
|
397
403
|
if body:
|
|
398
404
|
body = f"<tbody>{body}</tbody>"
|
|
399
|
-
res_parts.append(create_ser_result(text=body, span_source=
|
|
405
|
+
res_parts.append(create_ser_result(text=body, span_source=span_source))
|
|
400
406
|
|
|
401
407
|
text_res = "".join([r.text for r in res_parts])
|
|
402
408
|
text_res = f"<table>{text_res}</table>" if text_res else ""
|
|
@@ -55,6 +55,7 @@ from docling_core.types.doc.document import (
|
|
|
55
55
|
PictureItem,
|
|
56
56
|
PictureMoleculeData,
|
|
57
57
|
PictureTabularChartData,
|
|
58
|
+
RichTableCell,
|
|
58
59
|
SectionHeaderItem,
|
|
59
60
|
TableItem,
|
|
60
61
|
TextItem,
|
|
@@ -320,7 +321,13 @@ class MarkdownTableSerializer(BaseTableSerializer):
|
|
|
320
321
|
[
|
|
321
322
|
# make sure that md tables are not broken
|
|
322
323
|
# due to newline chars in the text
|
|
323
|
-
|
|
324
|
+
(
|
|
325
|
+
doc_serializer.serialize(
|
|
326
|
+
item=col.ref.resolve(doc=doc), **kwargs
|
|
327
|
+
).text
|
|
328
|
+
if isinstance(col, RichTableCell)
|
|
329
|
+
else col.text
|
|
330
|
+
).replace("\n", " ")
|
|
324
331
|
for col in row
|
|
325
332
|
]
|
|
326
333
|
for row in item.data.grid
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
|
|
8
8
|
from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
|
|
9
9
|
from .document import (
|
|
10
|
+
AnyTableCell,
|
|
10
11
|
BaseAnnotation,
|
|
11
12
|
ChartBar,
|
|
12
13
|
ChartLine,
|
|
@@ -52,6 +53,7 @@ from .document import (
|
|
|
52
53
|
PictureTabularChartData,
|
|
53
54
|
ProvenanceItem,
|
|
54
55
|
RefItem,
|
|
56
|
+
RichTableCell,
|
|
55
57
|
Script,
|
|
56
58
|
SectionHeaderItem,
|
|
57
59
|
TableCell,
|
|
@@ -34,7 +34,7 @@ from pydantic import (
|
|
|
34
34
|
validate_call,
|
|
35
35
|
)
|
|
36
36
|
from tabulate import tabulate
|
|
37
|
-
from typing_extensions import Annotated, Self, deprecated
|
|
37
|
+
from typing_extensions import Annotated, Self, deprecated, override
|
|
38
38
|
|
|
39
39
|
from docling_core.search.package import VERSION_PATTERN
|
|
40
40
|
from docling_core.types.base import _JSON_POINTER_REGEX
|
|
@@ -60,7 +60,7 @@ _logger = logging.getLogger(__name__)
|
|
|
60
60
|
|
|
61
61
|
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
|
|
62
62
|
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
|
|
63
|
-
CURRENT_VERSION: Final = "1.
|
|
63
|
+
CURRENT_VERSION: Final = "1.6.0"
|
|
64
64
|
|
|
65
65
|
DEFAULT_EXPORT_LABELS = {
|
|
66
66
|
DocItemLabel.TITLE,
|
|
@@ -325,7 +325,7 @@ class TableCell(BaseModel):
|
|
|
325
325
|
in data
|
|
326
326
|
):
|
|
327
327
|
return data
|
|
328
|
-
text = data
|
|
328
|
+
text = data.get("bbox", {}).get("token", "")
|
|
329
329
|
if not len(text):
|
|
330
330
|
text_cells = data.pop("text_cell_bboxes", None)
|
|
331
331
|
if text_cells:
|
|
@@ -337,11 +337,39 @@ class TableCell(BaseModel):
|
|
|
337
337
|
|
|
338
338
|
return data
|
|
339
339
|
|
|
340
|
+
def _get_text(self, doc: Optional["DoclingDocument"] = None, **kwargs: Any) -> str:
|
|
341
|
+
return self.text
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
class RichTableCell(TableCell):
|
|
345
|
+
"""RichTableCell."""
|
|
346
|
+
|
|
347
|
+
ref: "RefItem"
|
|
348
|
+
|
|
349
|
+
@override
|
|
350
|
+
def _get_text(self, doc: Optional["DoclingDocument"] = None, **kwargs: Any) -> str:
|
|
351
|
+
from docling_core.transforms.serializer.markdown import MarkdownDocSerializer
|
|
352
|
+
|
|
353
|
+
if doc is not None:
|
|
354
|
+
doc_serializer = kwargs.pop(
|
|
355
|
+
"doc_serializer", MarkdownDocSerializer(doc=doc)
|
|
356
|
+
)
|
|
357
|
+
ser_res = doc_serializer.serialize(item=self.ref.resolve(doc=doc), **kwargs)
|
|
358
|
+
return ser_res.text
|
|
359
|
+
else:
|
|
360
|
+
return "<!-- rich cell -->"
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
AnyTableCell = Annotated[
|
|
364
|
+
Union[RichTableCell, TableCell],
|
|
365
|
+
Field(union_mode="left_to_right"),
|
|
366
|
+
]
|
|
367
|
+
|
|
340
368
|
|
|
341
369
|
class TableData(BaseModel): # TBD
|
|
342
370
|
"""BaseTableData."""
|
|
343
371
|
|
|
344
|
-
table_cells: List[
|
|
372
|
+
table_cells: List[AnyTableCell] = []
|
|
345
373
|
num_rows: int = 0
|
|
346
374
|
num_cols: int = 0
|
|
347
375
|
|
|
@@ -380,7 +408,9 @@ class TableData(BaseModel): # TBD
|
|
|
380
408
|
|
|
381
409
|
return table_data
|
|
382
410
|
|
|
383
|
-
def remove_rows(
|
|
411
|
+
def remove_rows(
|
|
412
|
+
self, indices: List[int], doc: Optional["DoclingDocument"] = None
|
|
413
|
+
) -> List[List[TableCell]]:
|
|
384
414
|
"""Remove rows from the table by their indices.
|
|
385
415
|
|
|
386
416
|
:param indices: List[int]: A list of indices of the rows to remove. (Starting from 0)
|
|
@@ -392,6 +422,7 @@ class TableData(BaseModel): # TBD
|
|
|
392
422
|
|
|
393
423
|
indices = sorted(indices, reverse=True)
|
|
394
424
|
|
|
425
|
+
refs_to_remove = []
|
|
395
426
|
all_removed_cells = []
|
|
396
427
|
for row_index in indices:
|
|
397
428
|
if row_index < 0 or row_index >= self.num_rows:
|
|
@@ -403,6 +434,10 @@ class TableData(BaseModel): # TBD
|
|
|
403
434
|
end_idx = start_idx + self.num_cols
|
|
404
435
|
removed_cells = self.table_cells[start_idx:end_idx]
|
|
405
436
|
|
|
437
|
+
for cell in removed_cells:
|
|
438
|
+
if isinstance(cell, RichTableCell):
|
|
439
|
+
refs_to_remove.append(cell.ref)
|
|
440
|
+
|
|
406
441
|
# Remove the cells from the table
|
|
407
442
|
self.table_cells = self.table_cells[:start_idx] + self.table_cells[end_idx:]
|
|
408
443
|
|
|
@@ -417,9 +452,18 @@ class TableData(BaseModel): # TBD
|
|
|
417
452
|
|
|
418
453
|
all_removed_cells.append(removed_cells)
|
|
419
454
|
|
|
455
|
+
if refs_to_remove:
|
|
456
|
+
if doc is None:
|
|
457
|
+
_logger.warning(
|
|
458
|
+
"When table contains rich cells, `doc` argument must be provided, "
|
|
459
|
+
"otherwise rich cell content will be left dangling."
|
|
460
|
+
)
|
|
461
|
+
else:
|
|
462
|
+
doc._delete_items(refs_to_remove)
|
|
463
|
+
|
|
420
464
|
return all_removed_cells
|
|
421
465
|
|
|
422
|
-
def pop_row(self) -> List[TableCell]:
|
|
466
|
+
def pop_row(self, doc: Optional["DoclingDocument"] = None) -> List[TableCell]:
|
|
423
467
|
"""Remove and return the last row from the table.
|
|
424
468
|
|
|
425
469
|
:returns: List[TableCell]: A list of TableCell objects representing the popped row.
|
|
@@ -427,16 +471,18 @@ class TableData(BaseModel): # TBD
|
|
|
427
471
|
if self.num_rows == 0:
|
|
428
472
|
raise IndexError("Cannot pop from an empty table.")
|
|
429
473
|
|
|
430
|
-
return self.remove_row(self.num_rows - 1)
|
|
474
|
+
return self.remove_row(self.num_rows - 1, doc=doc)
|
|
431
475
|
|
|
432
|
-
def remove_row(
|
|
476
|
+
def remove_row(
|
|
477
|
+
self, row_index: int, doc: Optional["DoclingDocument"] = None
|
|
478
|
+
) -> List[TableCell]:
|
|
433
479
|
"""Remove a row from the table by its index.
|
|
434
480
|
|
|
435
481
|
:param row_index: int: The index of the row to remove. (Starting from 0)
|
|
436
482
|
|
|
437
483
|
:returns: List[TableCell]: A list of TableCell objects representing the removed row.
|
|
438
484
|
"""
|
|
439
|
-
return self.remove_rows([row_index])[0]
|
|
485
|
+
return self.remove_rows([row_index], doc=doc)[0]
|
|
440
486
|
|
|
441
487
|
def insert_rows(
|
|
442
488
|
self, row_index: int, rows: List[List[str]], after: bool = False
|
|
@@ -1509,8 +1555,15 @@ class TableItem(FloatingItem):
|
|
|
1509
1555
|
|
|
1510
1556
|
annotations: List[TableAnnotationType] = []
|
|
1511
1557
|
|
|
1512
|
-
def export_to_dataframe(
|
|
1558
|
+
def export_to_dataframe(
|
|
1559
|
+
self, doc: Optional["DoclingDocument"] = None
|
|
1560
|
+
) -> pd.DataFrame:
|
|
1513
1561
|
"""Export the table as a Pandas DataFrame."""
|
|
1562
|
+
if doc is None:
|
|
1563
|
+
_logger.warning(
|
|
1564
|
+
"Usage of TableItem.export_to_dataframe() without `doc` argument is deprecated."
|
|
1565
|
+
)
|
|
1566
|
+
|
|
1514
1567
|
if self.data.num_rows == 0 or self.data.num_cols == 0:
|
|
1515
1568
|
return pd.DataFrame()
|
|
1516
1569
|
|
|
@@ -1539,14 +1592,15 @@ class TableItem(FloatingItem):
|
|
|
1539
1592
|
columns = ["" for _ in range(self.data.num_cols)]
|
|
1540
1593
|
for i in range(num_headers):
|
|
1541
1594
|
for j, cell in enumerate(self.data.grid[i]):
|
|
1542
|
-
col_name = cell.
|
|
1595
|
+
col_name = cell._get_text(doc=doc)
|
|
1543
1596
|
if columns[j] != "":
|
|
1544
1597
|
col_name = f".{col_name}"
|
|
1545
1598
|
columns[j] += col_name
|
|
1546
1599
|
|
|
1547
1600
|
# Create table data
|
|
1548
1601
|
table_data = [
|
|
1549
|
-
[cell.
|
|
1602
|
+
[cell._get_text(doc=doc) for cell in row]
|
|
1603
|
+
for row in self.data.grid[num_headers:]
|
|
1550
1604
|
]
|
|
1551
1605
|
|
|
1552
1606
|
# Create DataFrame
|
|
@@ -1577,7 +1631,7 @@ class TableItem(FloatingItem):
|
|
|
1577
1631
|
|
|
1578
1632
|
# make sure that md tables are not broken
|
|
1579
1633
|
# due to newline chars in the text
|
|
1580
|
-
text = col.
|
|
1634
|
+
text = col._get_text(doc=doc)
|
|
1581
1635
|
text = text.replace("\n", " ")
|
|
1582
1636
|
tmp.append(text)
|
|
1583
1637
|
|
|
@@ -1623,6 +1677,7 @@ class TableItem(FloatingItem):
|
|
|
1623
1677
|
add_cell_text: bool = True,
|
|
1624
1678
|
xsize: int = 500,
|
|
1625
1679
|
ysize: int = 500,
|
|
1680
|
+
**kwargs: Any,
|
|
1626
1681
|
) -> str:
|
|
1627
1682
|
"""Export the table as OTSL."""
|
|
1628
1683
|
# Possible OTSL tokens...
|
|
@@ -1639,6 +1694,9 @@ class TableItem(FloatingItem):
|
|
|
1639
1694
|
# Headers (column, row, section row):
|
|
1640
1695
|
# "ched", "rhed", "srow"
|
|
1641
1696
|
|
|
1697
|
+
from docling_core.transforms.serializer.doctags import DocTagsDocSerializer
|
|
1698
|
+
|
|
1699
|
+
doc_serializer = DocTagsDocSerializer(doc=doc)
|
|
1642
1700
|
body = []
|
|
1643
1701
|
nrows = self.data.num_rows
|
|
1644
1702
|
ncols = self.data.num_cols
|
|
@@ -1652,7 +1710,9 @@ class TableItem(FloatingItem):
|
|
|
1652
1710
|
for i in range(nrows):
|
|
1653
1711
|
for j in range(ncols):
|
|
1654
1712
|
cell: TableCell = self.data.grid[i][j]
|
|
1655
|
-
content = cell.
|
|
1713
|
+
content = cell._get_text(
|
|
1714
|
+
doc=doc, doc_serializer=doc_serializer, **kwargs
|
|
1715
|
+
).strip()
|
|
1656
1716
|
rowspan, rowstart = (
|
|
1657
1717
|
cell.row_span,
|
|
1658
1718
|
cell.start_row_offset_idx,
|
|
@@ -2304,6 +2364,15 @@ class DoclingDocument(BaseModel):
|
|
|
2304
2364
|
refs_to_be_deleted=refs_to_be_deleted,
|
|
2305
2365
|
lookup=lookup,
|
|
2306
2366
|
)
|
|
2367
|
+
if isinstance(node, TableItem):
|
|
2368
|
+
for cell in node.data.table_cells:
|
|
2369
|
+
if isinstance(cell, RichTableCell):
|
|
2370
|
+
path = cell.ref._split_ref_to_path()
|
|
2371
|
+
cell.ref = self._update_ref_with_lookup(
|
|
2372
|
+
item_label=path[1],
|
|
2373
|
+
item_index=int(path[2]),
|
|
2374
|
+
lookup=lookup,
|
|
2375
|
+
)
|
|
2307
2376
|
|
|
2308
2377
|
# Update the self_ref reference
|
|
2309
2378
|
if node.parent is not None:
|
|
@@ -3945,16 +4014,22 @@ class DoclingDocument(BaseModel):
|
|
|
3945
4014
|
"""num_pages."""
|
|
3946
4015
|
return len(self.pages.values())
|
|
3947
4016
|
|
|
3948
|
-
def validate_tree(self, root) -> bool:
|
|
4017
|
+
def validate_tree(self, root: NodeItem) -> bool:
|
|
3949
4018
|
"""validate_tree."""
|
|
3950
|
-
res = []
|
|
3951
4019
|
for child_ref in root.children:
|
|
3952
4020
|
child = child_ref.resolve(self)
|
|
3953
|
-
if child.parent.resolve(self) != root:
|
|
4021
|
+
if child.parent.resolve(self) != root or not self.validate_tree(child):
|
|
3954
4022
|
return False
|
|
3955
|
-
res.append(self.validate_tree(child))
|
|
3956
4023
|
|
|
3957
|
-
|
|
4024
|
+
if isinstance(root, TableItem):
|
|
4025
|
+
for cell in root.data.table_cells:
|
|
4026
|
+
if isinstance(cell, RichTableCell) and (
|
|
4027
|
+
(par_ref := cell.ref.resolve(self).parent) is None
|
|
4028
|
+
or par_ref.resolve(self) != root
|
|
4029
|
+
):
|
|
4030
|
+
return False
|
|
4031
|
+
|
|
4032
|
+
return True
|
|
3958
4033
|
|
|
3959
4034
|
def iterate_items(
|
|
3960
4035
|
self,
|
|
@@ -3963,14 +4038,14 @@ class DoclingDocument(BaseModel):
|
|
|
3963
4038
|
traverse_pictures: bool = False,
|
|
3964
4039
|
page_no: Optional[int] = None,
|
|
3965
4040
|
included_content_layers: Optional[set[ContentLayer]] = None,
|
|
3966
|
-
_level: int = 0, #
|
|
4041
|
+
_level: int = 0, # deprecated
|
|
3967
4042
|
) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
|
|
3968
4043
|
"""Iterate elements with level."""
|
|
3969
4044
|
for item, stack in self._iterate_items_with_stack(
|
|
3970
4045
|
root=root,
|
|
3971
4046
|
with_groups=with_groups,
|
|
3972
4047
|
traverse_pictures=traverse_pictures,
|
|
3973
|
-
|
|
4048
|
+
page_nrs={page_no} if page_no is not None else None,
|
|
3974
4049
|
included_content_layers=included_content_layers,
|
|
3975
4050
|
):
|
|
3976
4051
|
yield item, len(stack)
|
|
@@ -3980,7 +4055,7 @@ class DoclingDocument(BaseModel):
|
|
|
3980
4055
|
root: Optional[NodeItem] = None,
|
|
3981
4056
|
with_groups: bool = False,
|
|
3982
4057
|
traverse_pictures: bool = False,
|
|
3983
|
-
|
|
4058
|
+
page_nrs: Optional[set[int]] = None,
|
|
3984
4059
|
included_content_layers: Optional[set[ContentLayer]] = None,
|
|
3985
4060
|
_stack: Optional[list[int]] = None,
|
|
3986
4061
|
) -> typing.Iterable[Tuple[NodeItem, list[int]]]: # tuple of node and level
|
|
@@ -4003,8 +4078,8 @@ class DoclingDocument(BaseModel):
|
|
|
4003
4078
|
and (
|
|
4004
4079
|
not isinstance(root, DocItem)
|
|
4005
4080
|
or (
|
|
4006
|
-
|
|
4007
|
-
or any(prov.page_no
|
|
4081
|
+
page_nrs is None
|
|
4082
|
+
or any(prov.page_no in page_nrs for prov in root.prov)
|
|
4008
4083
|
)
|
|
4009
4084
|
)
|
|
4010
4085
|
and root.content_layer in my_layers
|
|
@@ -4038,7 +4113,7 @@ class DoclingDocument(BaseModel):
|
|
|
4038
4113
|
child,
|
|
4039
4114
|
with_groups=with_groups,
|
|
4040
4115
|
traverse_pictures=traverse_pictures,
|
|
4041
|
-
|
|
4116
|
+
page_nrs=page_nrs,
|
|
4042
4117
|
_stack=my_stack,
|
|
4043
4118
|
included_content_layers=my_layers,
|
|
4044
4119
|
)
|
|
@@ -5324,7 +5399,9 @@ class DoclingDocument(BaseModel):
|
|
|
5324
5399
|
grid.append([])
|
|
5325
5400
|
for j, cell in enumerate(row):
|
|
5326
5401
|
if j < 10:
|
|
5327
|
-
text = get_text(
|
|
5402
|
+
text = get_text(
|
|
5403
|
+
cell._get_text(doc=self), max_text_len=16
|
|
5404
|
+
)
|
|
5328
5405
|
grid[-1].append(text)
|
|
5329
5406
|
|
|
5330
5407
|
result.append("\n" + tabulate(grid) + "\n")
|
|
@@ -5526,7 +5603,9 @@ class DoclingDocument(BaseModel):
|
|
|
5526
5603
|
def get_item_list(self, key: str) -> list[NodeItem]:
|
|
5527
5604
|
return getattr(self, key)
|
|
5528
5605
|
|
|
5529
|
-
def index(
|
|
5606
|
+
def index(
|
|
5607
|
+
self, doc: "DoclingDocument", page_nrs: Optional[set[int]] = None
|
|
5608
|
+
) -> None:
|
|
5530
5609
|
|
|
5531
5610
|
orig_ref_to_new_ref: dict[str, str] = {}
|
|
5532
5611
|
page_delta = self._max_page - min(doc.pages.keys()) + 1 if doc.pages else 0
|
|
@@ -5537,10 +5616,11 @@ class DoclingDocument(BaseModel):
|
|
|
5537
5616
|
self._names.append(doc.name)
|
|
5538
5617
|
|
|
5539
5618
|
# collect items in traversal order
|
|
5540
|
-
for item, _ in doc.
|
|
5619
|
+
for item, _ in doc._iterate_items_with_stack(
|
|
5541
5620
|
with_groups=True,
|
|
5542
5621
|
traverse_pictures=True,
|
|
5543
5622
|
included_content_layers={c for c in ContentLayer},
|
|
5623
|
+
page_nrs=page_nrs,
|
|
5544
5624
|
):
|
|
5545
5625
|
key = item.self_ref.split("/")[1]
|
|
5546
5626
|
is_body = key == "body"
|
|
@@ -5588,6 +5668,16 @@ class DoclingDocument(BaseModel):
|
|
|
5588
5668
|
)
|
|
5589
5669
|
break
|
|
5590
5670
|
|
|
5671
|
+
# update rich table cells references:
|
|
5672
|
+
if isinstance(parent_item, TableItem):
|
|
5673
|
+
for cell in parent_item.data.table_cells:
|
|
5674
|
+
if (
|
|
5675
|
+
isinstance(cell, RichTableCell)
|
|
5676
|
+
and cell.ref.cref == item.self_ref
|
|
5677
|
+
):
|
|
5678
|
+
cell.ref.cref = new_cref
|
|
5679
|
+
break
|
|
5680
|
+
|
|
5591
5681
|
elif num_components == 2 and path_components[1] == "body":
|
|
5592
5682
|
parent_item = self._body
|
|
5593
5683
|
else:
|
|
@@ -5599,12 +5689,13 @@ class DoclingDocument(BaseModel):
|
|
|
5599
5689
|
# update pages
|
|
5600
5690
|
new_max_page = None
|
|
5601
5691
|
for page_nr in doc.pages:
|
|
5602
|
-
|
|
5603
|
-
|
|
5604
|
-
|
|
5605
|
-
|
|
5606
|
-
|
|
5607
|
-
new_max_page
|
|
5692
|
+
if page_nrs is None or page_nr in page_nrs:
|
|
5693
|
+
new_page = copy.deepcopy(doc.pages[page_nr])
|
|
5694
|
+
new_page_nr = page_nr + page_delta
|
|
5695
|
+
new_page.page_no = new_page_nr
|
|
5696
|
+
self.pages[new_page_nr] = new_page
|
|
5697
|
+
if new_max_page is None or new_page_nr > new_max_page:
|
|
5698
|
+
new_max_page = new_page_nr
|
|
5608
5699
|
if new_max_page is not None:
|
|
5609
5700
|
self._max_page = new_max_page
|
|
5610
5701
|
|
|
@@ -5628,6 +5719,14 @@ class DoclingDocument(BaseModel):
|
|
|
5628
5719
|
doc_index.index(doc=self)
|
|
5629
5720
|
self._update_from_index(doc_index)
|
|
5630
5721
|
|
|
5722
|
+
def filter(self, page_nrs: Optional[set[int]] = None) -> "DoclingDocument":
|
|
5723
|
+
"""Create a new document based on the provided filter parameters."""
|
|
5724
|
+
doc_index = DoclingDocument._DocIndex()
|
|
5725
|
+
doc_index.index(doc=self, page_nrs=page_nrs)
|
|
5726
|
+
res_doc = DoclingDocument(name=self.name)
|
|
5727
|
+
res_doc._update_from_index(doc_index)
|
|
5728
|
+
return res_doc
|
|
5729
|
+
|
|
5631
5730
|
@classmethod
|
|
5632
5731
|
def concatenate(cls, docs: Sequence["DoclingDocument"]) -> "DoclingDocument":
|
|
5633
5732
|
"""Concatenate multiple documents into a single document."""
|
|
@@ -5676,6 +5775,18 @@ class DoclingDocument(BaseModel):
|
|
|
5676
5775
|
elif isinstance(item, ListItem):
|
|
5677
5776
|
validate_list_item(self, item)
|
|
5678
5777
|
|
|
5778
|
+
def add_table_cell(self, table_item: TableItem, cell: TableCell) -> None:
|
|
5779
|
+
"""Add a table cell to the table."""
|
|
5780
|
+
if isinstance(cell, RichTableCell):
|
|
5781
|
+
item = cell.ref.resolve(doc=self)
|
|
5782
|
+
if isinstance(item, NodeItem) and (
|
|
5783
|
+
(not item.parent) or item.parent.cref != table_item.self_ref
|
|
5784
|
+
):
|
|
5785
|
+
raise ValueError(
|
|
5786
|
+
f"Trying to add cell with another parent {item.parent} to {table_item.self_ref}"
|
|
5787
|
+
)
|
|
5788
|
+
table_item.data.table_cells.append(cell)
|
|
5789
|
+
|
|
5679
5790
|
|
|
5680
5791
|
# deprecated aliases (kept for backwards compatibility):
|
|
5681
5792
|
BasePictureData = BaseAnnotation
|
docling_core/utils/legacy.py
CHANGED
|
@@ -252,7 +252,7 @@ def docling_document_to_legacy(doc: DoclingDocument, fallback_filaname: str = "f
|
|
|
252
252
|
|
|
253
253
|
spans = list(_make_spans(cell, item))
|
|
254
254
|
table_data[i][j] = GlmTableCell(
|
|
255
|
-
text=cell.
|
|
255
|
+
text=cell._get_text(doc=doc),
|
|
256
256
|
bbox=(
|
|
257
257
|
cell.bbox.as_tuple()
|
|
258
258
|
if cell.bbox is not None
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.47.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -28,11 +28,11 @@ docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=aZ_RNQIzcNkAHGHZ
|
|
|
28
28
|
docling_core/transforms/chunker/tokenizer/openai.py,sha256=zt2kwcC-r8MafeEG0CESab8E4RIC9aaFXxxnxOGyTMA,918
|
|
29
29
|
docling_core/transforms/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3qFhKhe2HcYghN6_xw,105
|
|
30
30
|
docling_core/transforms/serializer/base.py,sha256=TI8Epj7gyxdTet9j-Rs4o5U09gfACfAIVoirlschviM,7266
|
|
31
|
-
docling_core/transforms/serializer/common.py,sha256=
|
|
32
|
-
docling_core/transforms/serializer/doctags.py,sha256=
|
|
33
|
-
docling_core/transforms/serializer/html.py,sha256=
|
|
31
|
+
docling_core/transforms/serializer/common.py,sha256=RwfdzZ9FRSHQjKM0vskg1CVqar0Z_ms38arSlLAgITc,19150
|
|
32
|
+
docling_core/transforms/serializer/doctags.py,sha256=VXPjAZPhBur7LaEeuqH9k31TgZWSN32lK8z8rJXzFwY,19935
|
|
33
|
+
docling_core/transforms/serializer/html.py,sha256=GRfRaqFIb4FXRMplB4Agl4fSNa5jsHV7P4tBtFMro9I,38453
|
|
34
34
|
docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
|
|
35
|
-
docling_core/transforms/serializer/markdown.py,sha256=
|
|
35
|
+
docling_core/transforms/serializer/markdown.py,sha256=hilGM1yWpbbRTjuEjfBRrhavspD5vFF_6SDvlKx8BrM,24230
|
|
36
36
|
docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
|
|
37
37
|
docling_core/transforms/visualizer/base.py,sha256=aEF7b3rHq6DVdX8zDYEPoq55BHDYe4Hh_97lBdcW4lY,555
|
|
38
38
|
docling_core/transforms/visualizer/key_value_visualizer.py,sha256=fp7nFLy4flOSiavdRgg5y1Mu7WVLIDGh1zEHsq8kgVM,8979
|
|
@@ -41,9 +41,9 @@ docling_core/transforms/visualizer/reading_order_visualizer.py,sha256=muqmaxOBao
|
|
|
41
41
|
docling_core/transforms/visualizer/table_visualizer.py,sha256=iJPjk-XQSSCH3oujcjPMz-redAwNNHseZ41lFyd-u3k,8097
|
|
42
42
|
docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
|
|
43
43
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
44
|
-
docling_core/types/doc/__init__.py,sha256=
|
|
44
|
+
docling_core/types/doc/__init__.py,sha256=Vsl3oJV3_BLpS7rIwvahhcWOwmEBvj7ZbQzQCCl-IQk,1678
|
|
45
45
|
docling_core/types/doc/base.py,sha256=i98y4IF250adR-8BSS374K90fwfwG-vBfWh14tLC5Cs,15906
|
|
46
|
-
docling_core/types/doc/document.py,sha256=
|
|
46
|
+
docling_core/types/doc/document.py,sha256=jyMcK1oiu8X8juNa9DuI3S1imn4hXwjOS7iTLQ1HykU,202707
|
|
47
47
|
docling_core/types/doc/labels.py,sha256=-W1-LW6z0J9F9ExJqR0Wd1WeqWTaY3Unm-j1UkQGlC4,7330
|
|
48
48
|
docling_core/types/doc/page.py,sha256=35h1xdtCM3-AaN8Dim9jDseZIiw-3GxpB-ofF-H2rQQ,41878
|
|
49
49
|
docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
|
|
@@ -73,12 +73,12 @@ docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,8
|
|
|
73
73
|
docling_core/utils/file.py,sha256=CSNclJGL2OwLIc8DQFdoLxr22FUc4_UC7zS6pNrFfkQ,6858
|
|
74
74
|
docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
|
|
75
75
|
docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
|
|
76
|
-
docling_core/utils/legacy.py,sha256=
|
|
76
|
+
docling_core/utils/legacy.py,sha256=G7ed8fkBpIO8hG3DKEY83cHsrKJHyvDst_1jSdgBXMI,24406
|
|
77
77
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
78
78
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
79
|
-
docling_core-2.
|
|
80
|
-
docling_core-2.
|
|
81
|
-
docling_core-2.
|
|
82
|
-
docling_core-2.
|
|
83
|
-
docling_core-2.
|
|
84
|
-
docling_core-2.
|
|
79
|
+
docling_core-2.47.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
80
|
+
docling_core-2.47.0.dist-info/METADATA,sha256=jW4Zdx0WwStnLDifSsvYyGLw-5C2IYiEeK4IQRGQi-I,6453
|
|
81
|
+
docling_core-2.47.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
82
|
+
docling_core-2.47.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
|
|
83
|
+
docling_core-2.47.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
|
|
84
|
+
docling_core-2.47.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|