docling-core 2.44.2__py3-none-any.whl → 2.46.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling_core/transforms/serializer/common.py +1 -0
- docling_core/transforms/serializer/doctags.py +2 -0
- docling_core/transforms/serializer/html.py +18 -12
- docling_core/transforms/serializer/markdown.py +8 -1
- docling_core/types/doc/__init__.py +2 -0
- docling_core/types/doc/document.py +285 -257
- docling_core/types/doc/utils.py +197 -1
- docling_core/utils/legacy.py +1 -1
- {docling_core-2.44.2.dist-info → docling_core-2.46.0.dist-info}/METADATA +1 -1
- {docling_core-2.44.2.dist-info → docling_core-2.46.0.dist-info}/RECORD +14 -14
- {docling_core-2.44.2.dist-info → docling_core-2.46.0.dist-info}/WHEEL +0 -0
- {docling_core-2.44.2.dist-info → docling_core-2.46.0.dist-info}/entry_points.txt +0 -0
- {docling_core-2.44.2.dist-info → docling_core-2.46.0.dist-info}/licenses/LICENSE +0 -0
- {docling_core-2.44.2.dist-info → docling_core-2.46.0.dist-info}/top_level.txt +0 -0
docling_core/transforms/serializer/doctags.py
CHANGED

@@ -157,6 +157,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
         item: TableItem,
         doc_serializer: BaseDocSerializer,
         doc: DoclingDocument,
+        visited: Optional[set[str]] = None,
         **kwargs: Any,
     ) -> SerializationResult:
         """Serializes the passed item."""
@@ -179,6 +180,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
             add_cell_text=params.add_table_cell_text,
             xsize=params.xsize,
             ysize=params.ysize,
+            visited=visited,
         )
         res_parts.append(create_ser_result(text=otsl_text, span_source=item))

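The only change in this serializer is the new `visited` parameter, which is forwarded into the table's OTSL export (presumably so that already-serialized refs, such as items referenced from rich table cells, can be tracked and not emitted twice). A minimal usage sketch, assuming a populated `doc` and that `DocTagsDocSerializer` follows the same `serialize().text` pattern as the other doc serializers:

    from docling_core.transforms.serializer.doctags import DocTagsDocSerializer
    from docling_core.types.doc.document import DoclingDocument

    doc = DoclingDocument(name="sample")  # assumed to be populated elsewhere
    doctags = DocTagsDocSerializer(doc=doc).serialize().text
    print(doctags)  # tables are emitted as <otsl>...</otsl> token sequences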
docling_core/transforms/serializer/html.py
CHANGED

@@ -65,8 +65,8 @@ from docling_core.types.doc.document import (
     PictureItem,
     PictureMoleculeData,
     PictureTabularChartData,
+    RichTableCell,
     SectionHeaderItem,
-    TableCell,
     TableItem,
     TextItem,
     TitleItem,
@@ -346,9 +346,6 @@ class HTMLTableSerializer(BaseTableSerializer):
         **kwargs: Any,
     ) -> SerializationResult:
         """Serializes the passed table item to HTML."""
-        nrows = item.data.num_rows
-        ncols = item.data.num_cols
-
         res_parts: list[SerializationResult] = []
         cap_res = doc_serializer.serialize_captions(item=item, tag="caption", **kwargs)
         if cap_res.text:
@@ -356,11 +353,11 @@ class HTMLTableSerializer(BaseTableSerializer):

         if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
             body = ""
+            span_source: Union[DocItem, list[SerializationResult]] = []

-            for i in …
+            for i, row in enumerate(item.data.grid):
                 body += "<tr>"
-                for j in …
-                    cell: TableCell = item.data.grid[i][j]
+                for j, cell in enumerate(row):

                     rowspan, rowstart = (
                         cell.row_span,
@@ -376,7 +373,16 @@
                     if colstart != j:
                         continue

-                    …
+                    if isinstance(cell, RichTableCell):
+                        ser_res = doc_serializer.serialize(
+                            item=cell.ref.resolve(doc=doc), **kwargs
+                        )
+                        content = ser_res.text
+                        span_source = [ser_res]
+                    else:
+                        content = html.escape(cell.text.strip())
+                        span_source = item
+
                     celltag = "td"
                     if cell.column_header or cell.row_header or cell.row_section:
                         celltag = "th"
@@ -389,14 +395,14 @@

                     text_dir = get_text_direction(content)
                     if text_dir == "rtl":
-                        opening_tag += f' dir="{…
+                        opening_tag += f' dir="{text_dir}"'

                     body += f"<{opening_tag}>{content}</{celltag}>"
                 body += "</tr>"

             if body:
                 body = f"<tbody>{body}</tbody>"
-            res_parts.append(create_ser_result(text=body, span_source=…
+            res_parts.append(create_ser_result(text=body, span_source=span_source))

         text_res = "".join([r.text for r in res_parts])
         text_res = f"<table>{text_res}</table>" if text_res else ""
@@ -1057,7 +1063,7 @@ class HTMLDocSerializer(DocSerializer):
         if self.params.html_head is not None:
             return self.params.html_head

-        head_parts = ["<head>", '<meta charset="UTF-8"…
+        head_parts = ["<head>", '<meta charset="UTF-8"/>']

         # Add metadata if requested
         if params.add_document_metadata:
@@ -1067,7 +1073,7 @@ class HTMLDocSerializer(DocSerializer):
             head_parts.append("<title>Docling Document</title>")

             head_parts.append(
-                '<meta name="generator" content="Docling HTML Serializer"…
+                '<meta name="generator" content="Docling HTML Serializer"/>'
             )

         # Add default styles or custom CSS
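Taken together, the HTML hunks replace the index-based walk over `item.data.grid` with direct row/cell iteration and render a `RichTableCell` by serializing the item it references instead of escaping plain cell text. A minimal sketch of driving the updated serializer, assuming a populated `doc` and the usual `serialize().text` pattern:

    from docling_core.transforms.serializer.html import HTMLDocSerializer
    from docling_core.types.doc.document import DoclingDocument

    doc = DoclingDocument(name="sample")  # assumed to be populated elsewhere
    html_out = HTMLDocSerializer(doc=doc).serialize().text
    # Rich cells contribute the serialized markup of the item behind cell.ref;
    # plain cells still contribute html.escape(cell.text.strip()).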
docling_core/transforms/serializer/markdown.py
CHANGED

@@ -55,6 +55,7 @@ from docling_core.types.doc.document import (
     PictureItem,
     PictureMoleculeData,
     PictureTabularChartData,
+    RichTableCell,
     SectionHeaderItem,
     TableItem,
     TextItem,
@@ -320,7 +321,13 @@ class MarkdownTableSerializer(BaseTableSerializer):
                 [
                     # make sure that md tables are not broken
                     # due to newline chars in the text
-                    …
+                    (
+                        doc_serializer.serialize(
+                            item=col.ref.resolve(doc=doc), **kwargs
+                        ).text
+                        if isinstance(col, RichTableCell)
+                        else col.text
+                    ).replace("\n", " ")
                     for col in row
                 ]
                 for row in item.data.grid
docling_core/types/doc/__init__.py
CHANGED

@@ -7,6 +7,7 @@

 from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
 from .document import (
+    AnyTableCell,
     BaseAnnotation,
     ChartBar,
     ChartLine,
@@ -52,6 +53,7 @@ from .document import (
     PictureTabularChartData,
     ProvenanceItem,
     RefItem,
+    RichTableCell,
     Script,
     SectionHeaderItem,
     TableCell,
docling_core/types/doc/document.py
CHANGED

@@ -3,7 +3,6 @@
 import base64
 import copy
 import hashlib
-import itertools
 import json
 import logging
 import mimetypes
@@ -35,7 +34,7 @@ from pydantic import (
     validate_call,
 )
 from tabulate import tabulate
-from typing_extensions import Annotated, Self, deprecated
+from typing_extensions import Annotated, Self, deprecated, override

 from docling_core.search.package import VERSION_PATTERN
 from docling_core.types.base import _JSON_POINTER_REGEX
@@ -54,14 +53,14 @@ from docling_core.types.doc.labels import (
     GroupLabel,
     PictureClassificationLabel,
 )
-from docling_core.types.doc.tokens import …
-from docling_core.types.doc.utils import relative_path
+from docling_core.types.doc.tokens import DocumentToken, TableToken
+from docling_core.types.doc.utils import parse_otsl_table_content, relative_path

 _logger = logging.getLogger(__name__)

 Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
 LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
-CURRENT_VERSION: Final = "1.…
+CURRENT_VERSION: Final = "1.6.0"

 DEFAULT_EXPORT_LABELS = {
     DocItemLabel.TITLE,
@@ -326,7 +325,7 @@ class TableCell(BaseModel):
             in data
         ):
             return data
-        text = data…
+        text = data.get("bbox", {}).get("token", "")
         if not len(text):
             text_cells = data.pop("text_cell_bboxes", None)
             if text_cells:
@@ -338,11 +337,39 @@

         return data

+    def _get_text(self, doc: Optional["DoclingDocument"] = None, **kwargs: Any) -> str:
+        return self.text
+
+
+class RichTableCell(TableCell):
+    """RichTableCell."""
+
+    ref: "RefItem"
+
+    @override
+    def _get_text(self, doc: Optional["DoclingDocument"] = None, **kwargs: Any) -> str:
+        from docling_core.transforms.serializer.markdown import MarkdownDocSerializer
+
+        if doc is not None:
+            doc_serializer = kwargs.pop(
+                "doc_serializer", MarkdownDocSerializer(doc=doc)
+            )
+            ser_res = doc_serializer.serialize(item=self.ref.resolve(doc=doc), **kwargs)
+            return ser_res.text
+        else:
+            return "<!-- rich cell -->"
+
+
+AnyTableCell = Annotated[
+    Union[RichTableCell, TableCell],
+    Field(union_mode="left_to_right"),
+]
+

 class TableData(BaseModel):  # TBD
     """BaseTableData."""

-    table_cells: List[…
+    table_cells: List[AnyTableCell] = []
     num_rows: int = 0
     num_cols: int = 0

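A sketch of how the new cell types are meant to be used, assuming the existing DoclingDocument builder helpers (`add_table`, `add_text`), the `add_table_cell` helper added further down in this diff, and purely hypothetical content. A `RichTableCell` carries a `RefItem` to another item in the document, and `_get_text()` resolves it through a serializer when a `doc` is passed, otherwise falling back to the `<!-- rich cell -->` placeholder:

    from docling_core.types.doc.document import (
        DoclingDocument,
        RefItem,
        RichTableCell,
        TableCell,
        TableData,
    )
    from docling_core.types.doc.labels import DocItemLabel

    doc = DoclingDocument(name="demo")
    table = doc.add_table(data=TableData(num_rows=1, num_cols=2))
    # item that the rich cell will point at; parented to the table
    note = doc.add_text(label=DocItemLabel.TEXT, text="See footnote 1.", parent=table)

    plain = TableCell(
        text="plain value",
        start_row_offset_idx=0, end_row_offset_idx=1,
        start_col_offset_idx=0, end_col_offset_idx=1,
    )
    rich = RichTableCell(
        text="", ref=RefItem(cref=note.self_ref),
        start_row_offset_idx=0, end_row_offset_idx=1,
        start_col_offset_idx=1, end_col_offset_idx=2,
    )
    doc.add_table_cell(table, plain)
    doc.add_table_cell(table, rich)  # validated against the table's self_ref

    rich._get_text(doc=doc)  # serialized text of the referenced item
    rich._get_text()         # "<!-- rich cell -->" without a document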
@@ -381,7 +408,9 @@ class TableData(BaseModel):  # TBD

         return table_data

-    def remove_rows(…
+    def remove_rows(
+        self, indices: List[int], doc: Optional["DoclingDocument"] = None
+    ) -> List[List[TableCell]]:
         """Remove rows from the table by their indices.

         :param indices: List[int]: A list of indices of the rows to remove. (Starting from 0)
@@ -393,6 +422,7 @@ class TableData(BaseModel):  # TBD

         indices = sorted(indices, reverse=True)

+        refs_to_remove = []
         all_removed_cells = []
         for row_index in indices:
             if row_index < 0 or row_index >= self.num_rows:
@@ -404,6 +434,10 @@ class TableData(BaseModel):  # TBD
             end_idx = start_idx + self.num_cols
             removed_cells = self.table_cells[start_idx:end_idx]

+            for cell in removed_cells:
+                if isinstance(cell, RichTableCell):
+                    refs_to_remove.append(cell.ref)
+
             # Remove the cells from the table
             self.table_cells = self.table_cells[:start_idx] + self.table_cells[end_idx:]

@@ -418,9 +452,18 @@ class TableData(BaseModel):  # TBD

             all_removed_cells.append(removed_cells)

+        if refs_to_remove:
+            if doc is None:
+                _logger.warning(
+                    "When table contains rich cells, `doc` argument must be provided, "
+                    "otherwise rich cell content will be left dangling."
+                )
+            else:
+                doc._delete_items(refs_to_remove)
+
         return all_removed_cells

-    def pop_row(self) -> List[TableCell]:
+    def pop_row(self, doc: Optional["DoclingDocument"] = None) -> List[TableCell]:
         """Remove and return the last row from the table.

         :returns: List[TableCell]: A list of TableCell objects representing the popped row.
@@ -428,16 +471,18 @@ class TableData(BaseModel):  # TBD
         if self.num_rows == 0:
             raise IndexError("Cannot pop from an empty table.")

-        return self.remove_row(self.num_rows - 1)
+        return self.remove_row(self.num_rows - 1, doc=doc)

-    def remove_row(…
+    def remove_row(
+        self, row_index: int, doc: Optional["DoclingDocument"] = None
+    ) -> List[TableCell]:
         """Remove a row from the table by its index.

         :param row_index: int: The index of the row to remove. (Starting from 0)

         :returns: List[TableCell]: A list of TableCell objects representing the removed row.
         """
-        return self.remove_rows([row_index])[0]
+        return self.remove_rows([row_index], doc=doc)[0]

     def insert_rows(
         self, row_index: int, rows: List[List[str]], after: bool = False
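Continuing the earlier sketch: the new `doc` argument lets row removal clean up the document items referenced by rich cells in the removed rows, instead of leaving them dangling:

    removed_cells = table.data.remove_row(0, doc=doc)  # also deletes items behind RichTableCell refs
    # Without doc, removing rows that contain rich cells only logs a warning
    # and the referenced items stay in the document:
    # table.data.pop_row()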
@@ -1510,8 +1555,15 @@ class TableItem(FloatingItem):

     annotations: List[TableAnnotationType] = []

-    def export_to_dataframe(…
+    def export_to_dataframe(
+        self, doc: Optional["DoclingDocument"] = None
+    ) -> pd.DataFrame:
         """Export the table as a Pandas DataFrame."""
+        if doc is None:
+            _logger.warning(
+                "Usage of TableItem.export_to_dataframe() without `doc` argument is deprecated."
+            )
+
         if self.data.num_rows == 0 or self.data.num_cols == 0:
             return pd.DataFrame()

@@ -1540,14 +1592,15 @@ class TableItem(FloatingItem):
         columns = ["" for _ in range(self.data.num_cols)]
         for i in range(num_headers):
             for j, cell in enumerate(self.data.grid[i]):
-                col_name = cell.…
+                col_name = cell._get_text(doc=doc)
                 if columns[j] != "":
                     col_name = f".{col_name}"
                 columns[j] += col_name

         # Create table data
         table_data = [
-            [cell.…
+            [cell._get_text(doc=doc) for cell in row]
+            for row in self.data.grid[num_headers:]
         ]

         # Create DataFrame
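Because header and body text now go through `_get_text()`, exporting a table that contains rich cells needs the owning document; continuing the sketch above:

    df = table.export_to_dataframe(doc=doc)  # rich cells appear as their serialized text
    # table.export_to_dataframe()            # still works, but logs a deprecation warning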
@@ -1578,7 +1631,7 @@ class TableItem(FloatingItem):

                 # make sure that md tables are not broken
                 # due to newline chars in the text
-                text = col.…
+                text = col._get_text(doc=doc)
                 text = text.replace("\n", " ")
                 tmp.append(text)

@@ -1624,6 +1677,7 @@ class TableItem(FloatingItem):
         add_cell_text: bool = True,
         xsize: int = 500,
         ysize: int = 500,
+        **kwargs: Any,
     ) -> str:
         """Export the table as OTSL."""
         # Possible OTSL tokens...
@@ -1640,6 +1694,9 @@ class TableItem(FloatingItem):
         # Headers (column, row, section row):
         # "ched", "rhed", "srow"

+        from docling_core.transforms.serializer.doctags import DocTagsDocSerializer
+
+        doc_serializer = DocTagsDocSerializer(doc=doc)
         body = []
         nrows = self.data.num_rows
         ncols = self.data.num_cols
@@ -1653,7 +1710,9 @@ class TableItem(FloatingItem):
         for i in range(nrows):
             for j in range(ncols):
                 cell: TableCell = self.data.grid[i][j]
-                content = cell.…
+                content = cell._get_text(
+                    doc=doc, doc_serializer=doc_serializer, **kwargs
+                ).strip()
                 rowspan, rowstart = (
                     cell.row_span,
                     cell.start_row_offset_idx,
@@ -2305,6 +2364,15 @@ class DoclingDocument(BaseModel):
                 refs_to_be_deleted=refs_to_be_deleted,
                 lookup=lookup,
             )
+            if isinstance(node, TableItem):
+                for cell in node.data.table_cells:
+                    if isinstance(cell, RichTableCell):
+                        path = cell.ref._split_ref_to_path()
+                        cell.ref = self._update_ref_with_lookup(
+                            item_label=path[1],
+                            item_index=int(path[2]),
+                            lookup=lookup,
+                        )

             # Update the self_ref reference
             if node.parent is not None:
@@ -3946,16 +4014,22 @@ class DoclingDocument(BaseModel):
         """num_pages."""
         return len(self.pages.values())

-    def validate_tree(self, root) -> bool:
+    def validate_tree(self, root: NodeItem) -> bool:
         """validate_tree."""
-        res = []
         for child_ref in root.children:
             child = child_ref.resolve(self)
-            if child.parent.resolve(self) != root:
+            if child.parent.resolve(self) != root or not self.validate_tree(child):
                 return False
-            res.append(self.validate_tree(child))

-        …
+        if isinstance(root, TableItem):
+            for cell in root.data.table_cells:
+                if isinstance(cell, RichTableCell) and (
+                    (par_ref := cell.ref.resolve(self).parent) is None
+                    or par_ref.resolve(self) != root
+                ):
+                    return False
+
+        return True

     def iterate_items(
         self,
@@ -3964,7 +4038,7 @@ class DoclingDocument(BaseModel):
         traverse_pictures: bool = False,
         page_no: Optional[int] = None,
         included_content_layers: Optional[set[ContentLayer]] = None,
-        _level: int = 0,  # …
+        _level: int = 0,  # deprecated
     ) -> typing.Iterable[Tuple[NodeItem, int]]:  # tuple of node and level
         """Iterate elements with level."""
         for item, stack in self._iterate_items_with_stack(
@@ -4688,181 +4762,6 @@ class DoclingDocument(BaseModel):
                bbox = None
            return caption_item, bbox

-        def otsl_parse_texts(texts, tokens):
-            split_word = TableToken.OTSL_NL.value
-            # CLEAN tokens from extra tags, only structural OTSL allowed
-            clean_tokens = []
-            for t in tokens:
-                if t in [
-                    TableToken.OTSL_ECEL.value,
-                    TableToken.OTSL_FCEL.value,
-                    TableToken.OTSL_LCEL.value,
-                    TableToken.OTSL_UCEL.value,
-                    TableToken.OTSL_XCEL.value,
-                    TableToken.OTSL_NL.value,
-                    TableToken.OTSL_CHED.value,
-                    TableToken.OTSL_RHED.value,
-                    TableToken.OTSL_SROW.value,
-                ]:
-                    clean_tokens.append(t)
-            tokens = clean_tokens
-            split_row_tokens = [
-                list(y)
-                for x, y in itertools.groupby(tokens, lambda z: z == split_word)
-                if not x
-            ]
-
-            table_cells = []
-            r_idx = 0
-            c_idx = 0
-
-            def count_right(tokens, c_idx, r_idx, which_tokens):
-                span = 0
-                c_idx_iter = c_idx
-                while tokens[r_idx][c_idx_iter] in which_tokens:
-                    c_idx_iter += 1
-                    span += 1
-                    if c_idx_iter >= len(tokens[r_idx]):
-                        return span
-                return span
-
-            def count_down(tokens, c_idx, r_idx, which_tokens):
-                span = 0
-                r_idx_iter = r_idx
-                while tokens[r_idx_iter][c_idx] in which_tokens:
-                    r_idx_iter += 1
-                    span += 1
-                    if r_idx_iter >= len(tokens):
-                        return span
-                return span
-
-            for i, text in enumerate(texts):
-                cell_text = ""
-                if text in [
-                    TableToken.OTSL_FCEL.value,
-                    TableToken.OTSL_ECEL.value,
-                    TableToken.OTSL_CHED.value,
-                    TableToken.OTSL_RHED.value,
-                    TableToken.OTSL_SROW.value,
-                ]:
-                    row_span = 1
-                    col_span = 1
-                    right_offset = 1
-                    if text != TableToken.OTSL_ECEL.value:
-                        cell_text = texts[i + 1]
-                        right_offset = 2
-
-                    # Check next element(s) for lcel / ucel / xcel,
-                    # set properly row_span, col_span
-                    next_right_cell = ""
-                    if i + right_offset < len(texts):
-                        next_right_cell = texts[i + right_offset]
-
-                    next_bottom_cell = ""
-                    if r_idx + 1 < len(split_row_tokens):
-                        if c_idx < len(split_row_tokens[r_idx + 1]):
-                            next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
-
-                    if next_right_cell in [
-                        TableToken.OTSL_LCEL.value,
-                        TableToken.OTSL_XCEL.value,
-                    ]:
-                        # we have horisontal spanning cell or 2d spanning cell
-                        col_span += count_right(
-                            split_row_tokens,
-                            c_idx + 1,
-                            r_idx,
-                            [TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
-                        )
-                    if next_bottom_cell in [
-                        TableToken.OTSL_UCEL.value,
-                        TableToken.OTSL_XCEL.value,
-                    ]:
-                        # we have a vertical spanning cell or 2d spanning cell
-                        row_span += count_down(
-                            split_row_tokens,
-                            c_idx,
-                            r_idx + 1,
-                            [TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
-                        )
-
-                    table_cells.append(
-                        TableCell(
-                            text=cell_text.strip(),
-                            row_span=row_span,
-                            col_span=col_span,
-                            start_row_offset_idx=r_idx,
-                            end_row_offset_idx=r_idx + row_span,
-                            start_col_offset_idx=c_idx,
-                            end_col_offset_idx=c_idx + col_span,
-                        )
-                    )
-                if text in [
-                    TableToken.OTSL_FCEL.value,
-                    TableToken.OTSL_ECEL.value,
-                    TableToken.OTSL_CHED.value,
-                    TableToken.OTSL_RHED.value,
-                    TableToken.OTSL_SROW.value,
-                    TableToken.OTSL_LCEL.value,
-                    TableToken.OTSL_UCEL.value,
-                    TableToken.OTSL_XCEL.value,
-                ]:
-                    c_idx += 1
-                if text == TableToken.OTSL_NL.value:
-                    r_idx += 1
-                    c_idx = 0
-            return table_cells, split_row_tokens
-
-        def otsl_extract_tokens_and_text(s: str):
-            # Pattern to match anything enclosed by < >
-            # (including the angle brackets themselves)
-            pattern = r"(<[^>]+>)"
-            # Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
-            tokens = re.findall(pattern, s)
-            # Remove any tokens that start with "<loc_"
-            tokens = [
-                token
-                for token in tokens
-                if not (
-                    token.startswith(rf"<{_LOC_PREFIX}")
-                    or token
-                    in [
-                        rf"<{DocumentToken.OTSL.value}>",
-                        rf"</{DocumentToken.OTSL.value}>",
-                    ]
-                )
-            ]
-            # Split the string by those tokens to get the in-between text
-            text_parts = re.split(pattern, s)
-            text_parts = [
-                token
-                for token in text_parts
-                if not (
-                    token.startswith(rf"<{_LOC_PREFIX}")
-                    or token
-                    in [
-                        rf"<{DocumentToken.OTSL.value}>",
-                        rf"</{DocumentToken.OTSL.value}>",
-                    ]
-                )
-            ]
-            # Remove any empty or purely whitespace strings from text_parts
-            text_parts = [part for part in text_parts if part.strip()]
-
-            return tokens, text_parts
-
-        def parse_table_content(otsl_content: str) -> TableData:
-            tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
-            table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
-
-            return TableData(
-                num_rows=len(split_row_tokens),
-                num_cols=(
-                    max(len(row) for row in split_row_tokens) if split_row_tokens else 0
-                ),
-                table_cells=table_cells,
-            )
-
         def extract_chart_type(text_chunk: str):
             label = None
             chart_labels = [
@@ -5094,7 +4993,7 @@ class DoclingDocument(BaseModel):
                 doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.TEXT)

                 if tag_name == DocumentToken.OTSL.value:
-                    table_data = …
+                    table_data = parse_otsl_table_content(full_chunk)
                     caption, caption_bbox = extract_caption(full_chunk)
                     if caption is not None and caption_bbox is not None:
                         caption.prov.append(
@@ -5137,7 +5036,7 @@ class DoclingDocument(BaseModel):
                     table_data = None
                     chart_type = None
                     if tag_name == DocumentToken.CHART.value:
-                        table_data = …
+                        table_data = parse_otsl_table_content(full_chunk)
                         chart_type = extract_chart_type(full_chunk)
                     if image:
                         if bbox:
@@ -5500,7 +5399,9 @@ class DoclingDocument(BaseModel):
             grid.append([])
             for j, cell in enumerate(row):
                 if j < 10:
-                    text = get_text(…
+                    text = get_text(
+                        cell._get_text(doc=self), max_text_len=16
+                    )
                 grid[-1].append(text)

         result.append("\n" + tabulate(grid) + "\n")
@@ -5683,69 +5584,196 @@ class DoclingDocument(BaseModel):
         )
         return self

+    class _DocIndex(BaseModel):
+        """A document merge buffer."""
+
+        groups: list[GroupItem] = []
+        texts: list[TextItem] = []
+        pictures: list[PictureItem] = []
+        tables: list[TableItem] = []
+        key_value_items: list[KeyValueItem] = []
+        form_items: list[FormItem] = []
+
+        pages: dict[int, PageItem] = {}
+
+        _body: Optional[GroupItem] = None
+        _max_page: int = 0
+        _names: list[str] = []
+
+        def get_item_list(self, key: str) -> list[NodeItem]:
+            return getattr(self, key)
+
+        def index(self, doc: "DoclingDocument") -> None:
+
+            orig_ref_to_new_ref: dict[str, str] = {}
+            page_delta = self._max_page - min(doc.pages.keys()) + 1 if doc.pages else 0
+
+            if self._body is None:
+                self._body = GroupItem(**doc.body.model_dump(exclude={"children"}))
+
+            self._names.append(doc.name)
+
+            # collect items in traversal order
+            for item, _ in doc.iterate_items(
+                with_groups=True,
+                traverse_pictures=True,
+                included_content_layers={c for c in ContentLayer},
+            ):
+                key = item.self_ref.split("/")[1]
+                is_body = key == "body"
+                new_cref = (
+                    "#/body" if is_body else f"#/{key}/{len(self.get_item_list(key))}"
+                )
+                # register cref mapping:
+                orig_ref_to_new_ref[item.self_ref] = new_cref
+
+                if not is_body:
+                    new_item = copy.deepcopy(item)
+                    new_item.children = []
+
+                    # put item in the right list
+                    self.get_item_list(key).append(new_item)
+
+                    # update item's self reference
+                    new_item.self_ref = new_cref
+
+                    if isinstance(new_item, DocItem):
+                        # update page numbers
+                        # NOTE other prov sources (e.g. GraphCell) currently not covered
+                        for prov in new_item.prov:
+                            prov.page_no += page_delta
+
+                    if item.parent:
+                        # set item's parent
+                        new_parent_cref = orig_ref_to_new_ref[item.parent.cref]
+                        new_item.parent = RefItem(cref=new_parent_cref)
+
+                        # add item to parent's children
+                        path_components = new_parent_cref.split("/")
+                        num_components = len(path_components)
+                        if num_components == 3:
+                            _, parent_key, parent_index_str = path_components
+                            parent_index = int(parent_index_str)
+                            parent_item = self.get_item_list(parent_key)[parent_index]
+
+                            # update captions field (not possible in iterate_items order):
+                            if isinstance(parent_item, FloatingItem):
+                                for cap_it, cap in enumerate(parent_item.captions):
+                                    if cap.cref == item.self_ref:
+                                        parent_item.captions[cap_it] = RefItem(
+                                            cref=new_cref
+                                        )
+                                        break
+
+                            # update rich table cells references:
+                            if isinstance(parent_item, TableItem):
+                                for cell in parent_item.data.table_cells:
+                                    if (
+                                        isinstance(cell, RichTableCell)
+                                        and cell.ref.cref == item.self_ref
+                                    ):
+                                        cell.ref.cref = new_cref
+                                        break
+
+                        elif num_components == 2 and path_components[1] == "body":
+                            parent_item = self._body
+                        else:
+                            raise RuntimeError(
+                                f"Unsupported ref format: {new_parent_cref}"
+                            )
+                        parent_item.children.append(RefItem(cref=new_cref))
+
+            # update pages
+            new_max_page = None
+            for page_nr in doc.pages:
+                new_page = copy.deepcopy(doc.pages[page_nr])
+                new_page_nr = page_nr + page_delta
+                new_page.page_no = new_page_nr
+                self.pages[new_page_nr] = new_page
+                if new_max_page is None or new_page_nr > new_max_page:
+                    new_max_page = new_page_nr
+            if new_max_page is not None:
+                self._max_page = new_max_page
+
+        def get_name(self) -> str:
+            return " + ".join(self._names)
+
+    def _update_from_index(self, doc_index: "_DocIndex") -> None:
+        if doc_index._body is not None:
+            self.body = doc_index._body
+        self.groups = doc_index.groups
+        self.texts = doc_index.texts
+        self.pictures = doc_index.pictures
+        self.tables = doc_index.tables
+        self.key_value_items = doc_index.key_value_items
+        self.form_items = doc_index.form_items
+        self.pages = doc_index.pages
+        self.name = doc_index.get_name()
+
     def _normalize_references(self) -> None:
-        …
+        doc_index = DoclingDocument._DocIndex()
+        doc_index.index(doc=self)
+        self._update_from_index(doc_index)
+
+    @classmethod
+    def concatenate(cls, docs: Sequence["DoclingDocument"]) -> "DoclingDocument":
+        """Concatenate multiple documents into a single document."""
+        doc_index = DoclingDocument._DocIndex()
+        for doc in docs:
+            doc_index.index(doc=doc)
+
+        res_doc = DoclingDocument(name=" + ".join([doc.name for doc in docs]))
+        res_doc._update_from_index(doc_index)
+        return res_doc
+
+    def _validate_rules(self):
+        def validate_list_group(doc: DoclingDocument, item: ListGroup):
+            for ref in item.children:
+                child = ref.resolve(doc)
+                if not isinstance(child, ListItem):
+                    raise ValueError(
+                        f"ListGroup {item.self_ref} contains non-ListItem {child.self_ref} ({child.label=})"
+                    )
+
+        def validate_list_item(doc: DoclingDocument, item: ListItem):
+            if item.parent is None:
+                raise ValueError(f"ListItem {item.self_ref} has no parent")
+            if not isinstance(item.parent.resolve(doc), ListGroup):
+                raise ValueError(
+                    f"ListItem {item.self_ref} has non-ListGroup parent: {item.parent.cref}"
+                )
+
+        def validate_group(doc: DoclingDocument, item: GroupItem):
+            if (
+                item.parent and not item.children
+            ):  # tolerate empty body, but not other groups
+                raise ValueError(f"Group {item.self_ref} has no children")

-        # collect items in traversal order
         for item, _ in self.iterate_items(
             with_groups=True,
             traverse_pictures=True,
             included_content_layers={c for c in ContentLayer},
         ):
-            …
-            # add item to parent's children
-            path_components = new_parent_cref.split("/")
-            num_components = len(path_components)
-            parent_node: NodeItem
-            if num_components == 3:
-                _, parent_key, parent_index_str = path_components
-                parent_index = int(parent_index_str)
-                parent_node = item_lists[parent_key][parent_index]
-            elif num_components == 2 and path_components[1] == "body":
-                parent_node = new_body
-            else:
-                raise RuntimeError(f"Unsupported ref format: {new_parent_cref}")
-            parent_node.children.append(RefItem(cref=new_cref))
-
-        # update document
-        self.groups = item_lists["groups"]  # type: ignore
-        self.texts = item_lists["texts"]  # type: ignore
-        self.pictures = item_lists["pictures"]  # type: ignore
-        self.tables = item_lists["tables"]  # type: ignore
-        self.key_value_items = item_lists["key_value_items"]  # type: ignore
-        self.form_items = item_lists["form_items"]  # type: ignore
-        self.body = new_body
+            if isinstance(item, ListGroup):
+                validate_list_group(self, item)
+
+            elif isinstance(item, GroupItem):
+                validate_group(self, item)
+
+            elif isinstance(item, ListItem):
+                validate_list_item(self, item)
+
+    def add_table_cell(self, table_item: TableItem, cell: TableCell) -> None:
+        """Add a table cell to the table."""
+        if isinstance(cell, RichTableCell):
+            item = cell.ref.resolve(doc=self)
+            if isinstance(item, NodeItem) and (
+                (not item.parent) or item.parent.cref != table_item.self_ref
+            ):
+                raise ValueError(
+                    f"Trying to add cell with another parent {item.parent} to {table_item.self_ref}"
+                )
+        table_item.data.table_cells.append(cell)


     # deprecated aliases (kept for backwards compatibility):
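A sketch of the new concatenation entry point, with two assumed, already-populated documents; `_DocIndex` renumbers `self_ref` values, shifts page numbers, and rewrites caption and rich-table-cell references so the merged tree stays consistent:

    from docling_core.types.doc.document import DoclingDocument

    doc_a = DoclingDocument(name="part-1")  # assumed populated elsewhere
    doc_b = DoclingDocument(name="part-2")  # assumed populated elsewhere

    merged = DoclingDocument.concatenate([doc_a, doc_b])
    print(merged.name)         # "part-1 + part-2"
    print(merged.num_pages())  # doc_b's pages renumbered to follow doc_a's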
docling_core/types/doc/utils.py
CHANGED

@@ -6,9 +6,16 @@
 """Utils for document types."""

 import html
+import itertools
+import re
 import unicodedata
 from pathlib import Path
-from typing import Optional
+from typing import TYPE_CHECKING, List, Optional, Tuple
+
+from docling_core.types.doc.tokens import _LOC_PREFIX, DocumentToken, TableToken
+
+if TYPE_CHECKING:
+    from docling_core.types.doc.document import TableCell, TableData


 def relative_path(src: Path, target: Path) -> Path:
@@ -84,3 +91,192 @@ def get_text_direction(text: str) -> str:
         or rtl_chars > len(text) / 2
         else "ltr"
     )
+
+
+def otsl_extract_tokens_and_text(s: str) -> Tuple[List[str], List[str]]:
+    """Extract OTSL tokens and text from an OTSL string."""
+    # Pattern to match anything enclosed by < >
+    # (including the angle brackets themselves)
+    pattern = r"(<[^>]+>)"
+    # Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
+    tokens = re.findall(pattern, s)
+    # Remove any tokens that start with "<loc_"
+    tokens = [
+        token
+        for token in tokens
+        if not (
+            token.startswith(rf"<{_LOC_PREFIX}")
+            or token
+            in [
+                rf"<{DocumentToken.OTSL.value}>",
+                rf"</{DocumentToken.OTSL.value}>",
+            ]
+        )
+    ]
+    # Split the string by those tokens to get the in-between text
+    text_parts = re.split(pattern, s)
+    text_parts = [
+        token
+        for token in text_parts
+        if not (
+            token.startswith(rf"<{_LOC_PREFIX}")
+            or token
+            in [
+                rf"<{DocumentToken.OTSL.value}>",
+                rf"</{DocumentToken.OTSL.value}>",
+            ]
+        )
+    ]
+    # Remove any empty or purely whitespace strings from text_parts
+    text_parts = [part for part in text_parts if part.strip()]
+
+    return tokens, text_parts
+
+
+def otsl_parse_texts(
+    texts: List[str], tokens: List[str]
+) -> Tuple[List["TableCell"], List[List[str]]]:
+    """Parse OTSL texts and tokens into table cells."""
+    from docling_core.types.doc.document import TableCell
+
+    split_word = TableToken.OTSL_NL.value
+    # CLEAN tokens from extra tags, only structural OTSL allowed
+    clean_tokens = []
+    for t in tokens:
+        if t in [
+            TableToken.OTSL_ECEL.value,
+            TableToken.OTSL_FCEL.value,
+            TableToken.OTSL_LCEL.value,
+            TableToken.OTSL_UCEL.value,
+            TableToken.OTSL_XCEL.value,
+            TableToken.OTSL_NL.value,
+            TableToken.OTSL_CHED.value,
+            TableToken.OTSL_RHED.value,
+            TableToken.OTSL_SROW.value,
+        ]:
+            clean_tokens.append(t)
+    tokens = clean_tokens
+    split_row_tokens = [
+        list(y)
+        for x, y in itertools.groupby(tokens, lambda z: z == split_word)
+        if not x
+    ]
+
+    table_cells = []
+    r_idx = 0
+    c_idx = 0
+
+    def count_right(
+        tokens: List[List[str]], c_idx: int, r_idx: int, which_tokens: List[str]
+    ) -> int:
+        span = 0
+        c_idx_iter = c_idx
+        while tokens[r_idx][c_idx_iter] in which_tokens:
+            c_idx_iter += 1
+            span += 1
+            if c_idx_iter >= len(tokens[r_idx]):
+                return span
+        return span
+
+    def count_down(
+        tokens: List[List[str]], c_idx: int, r_idx: int, which_tokens: List[str]
+    ) -> int:
+        span = 0
+        r_idx_iter = r_idx
+        while tokens[r_idx_iter][c_idx] in which_tokens:
+            r_idx_iter += 1
+            span += 1
+            if r_idx_iter >= len(tokens):
+                return span
+        return span
+
+    for i, text in enumerate(texts):
+        cell_text = ""
+        if text in [
+            TableToken.OTSL_FCEL.value,
+            TableToken.OTSL_ECEL.value,
+            TableToken.OTSL_CHED.value,
+            TableToken.OTSL_RHED.value,
+            TableToken.OTSL_SROW.value,
+        ]:
+            row_span = 1
+            col_span = 1
+            right_offset = 1
+            if text != TableToken.OTSL_ECEL.value:
+                cell_text = texts[i + 1]
+                right_offset = 2
+
+            # Check next element(s) for lcel / ucel / xcel,
+            # set properly row_span, col_span
+            next_right_cell = ""
+            if i + right_offset < len(texts):
+                next_right_cell = texts[i + right_offset]
+
+            next_bottom_cell = ""
+            if r_idx + 1 < len(split_row_tokens):
+                if c_idx < len(split_row_tokens[r_idx + 1]):
+                    next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
+
+            if next_right_cell in [
+                TableToken.OTSL_LCEL.value,
+                TableToken.OTSL_XCEL.value,
+            ]:
+                # we have horizontal spanning cell or 2d spanning cell
+                col_span += count_right(
+                    split_row_tokens,
+                    c_idx + 1,
+                    r_idx,
+                    [TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
+                )
+            if next_bottom_cell in [
+                TableToken.OTSL_UCEL.value,
+                TableToken.OTSL_XCEL.value,
+            ]:
+                # we have a vertical spanning cell or 2d spanning cell
+                row_span += count_down(
+                    split_row_tokens,
+                    c_idx,
+                    r_idx + 1,
+                    [TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
+                )
+
+            table_cells.append(
+                TableCell(
+                    text=cell_text.strip(),
+                    row_span=row_span,
+                    col_span=col_span,
+                    start_row_offset_idx=r_idx,
+                    end_row_offset_idx=r_idx + row_span,
+                    start_col_offset_idx=c_idx,
+                    end_col_offset_idx=c_idx + col_span,
+                )
+            )
+        if text in [
+            TableToken.OTSL_FCEL.value,
+            TableToken.OTSL_ECEL.value,
+            TableToken.OTSL_CHED.value,
+            TableToken.OTSL_RHED.value,
+            TableToken.OTSL_SROW.value,
+            TableToken.OTSL_LCEL.value,
+            TableToken.OTSL_UCEL.value,
+            TableToken.OTSL_XCEL.value,
+        ]:
+            c_idx += 1
+        if text == TableToken.OTSL_NL.value:
+            r_idx += 1
+            c_idx = 0
+    return table_cells, split_row_tokens
+
+
+def parse_otsl_table_content(otsl_content: str) -> "TableData":
+    """Parse OTSL content into TableData."""
+    from docling_core.types.doc.document import TableData
+
+    tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
+    table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
+
+    return TableData(
+        num_rows=len(split_row_tokens),
+        num_cols=(max(len(row) for row in split_row_tokens) if split_row_tokens else 0),
+        table_cells=table_cells,
+    )
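The OTSL helpers moved here from document.py are now importable on their own. A small sketch of parsing an OTSL fragment into `TableData`; the token spellings (`<otsl>`, `<ched>`, `<fcel>`, `<nl>`) follow the DocTags/OTSL vocabulary defined in docling_core.types.doc.tokens:

    from docling_core.types.doc.utils import parse_otsl_table_content

    otsl = "<otsl><ched>name<ched>value<nl><fcel>alpha<fcel>1<nl><fcel>beta<fcel>2<nl></otsl>"
    data = parse_otsl_table_content(otsl)
    print(data.num_rows, data.num_cols)  # expected: 3 2
    print(data.table_cells[2].text)      # expected: "alpha"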
docling_core/utils/legacy.py
CHANGED

@@ -252,7 +252,7 @@ def docling_document_to_legacy(doc: DoclingDocument, fallback_filaname: str = "f

             spans = list(_make_spans(cell, item))
             table_data[i][j] = GlmTableCell(
-                text=cell.…
+                text=cell._get_text(doc=doc),
                 bbox=(
                     cell.bbox.as_tuple()
                     if cell.bbox is not None
{docling_core-2.44.2.dist-info → docling_core-2.46.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling-core
-Version: 2.44.2
+Version: 2.46.0
 Summary: A python library to define and validate data types in Docling.
 Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
{docling_core-2.44.2.dist-info → docling_core-2.46.0.dist-info}/RECORD
CHANGED

@@ -28,11 +28,11 @@ docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=aZ_RNQIzcNkAHGHZ
 docling_core/transforms/chunker/tokenizer/openai.py,sha256=zt2kwcC-r8MafeEG0CESab8E4RIC9aaFXxxnxOGyTMA,918
 docling_core/transforms/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3qFhKhe2HcYghN6_xw,105
 docling_core/transforms/serializer/base.py,sha256=TI8Epj7gyxdTet9j-Rs4o5U09gfACfAIVoirlschviM,7266
-docling_core/transforms/serializer/common.py,sha256=…
-docling_core/transforms/serializer/doctags.py,sha256=…
-docling_core/transforms/serializer/html.py,sha256=…
+docling_core/transforms/serializer/common.py,sha256=RwfdzZ9FRSHQjKM0vskg1CVqar0Z_ms38arSlLAgITc,19150
+docling_core/transforms/serializer/doctags.py,sha256=VXPjAZPhBur7LaEeuqH9k31TgZWSN32lK8z8rJXzFwY,19935
+docling_core/transforms/serializer/html.py,sha256=GRfRaqFIb4FXRMplB4Agl4fSNa5jsHV7P4tBtFMro9I,38453
 docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
-docling_core/transforms/serializer/markdown.py,sha256=…
+docling_core/transforms/serializer/markdown.py,sha256=hilGM1yWpbbRTjuEjfBRrhavspD5vFF_6SDvlKx8BrM,24230
 docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
 docling_core/transforms/visualizer/base.py,sha256=aEF7b3rHq6DVdX8zDYEPoq55BHDYe4Hh_97lBdcW4lY,555
 docling_core/transforms/visualizer/key_value_visualizer.py,sha256=fp7nFLy4flOSiavdRgg5y1Mu7WVLIDGh1zEHsq8kgVM,8979
@@ -41,13 +41,13 @@ docling_core/transforms/visualizer/reading_order_visualizer.py,sha256=muqmaxOBao
 docling_core/transforms/visualizer/table_visualizer.py,sha256=iJPjk-XQSSCH3oujcjPMz-redAwNNHseZ41lFyd-u3k,8097
 docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
 docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
-docling_core/types/doc/__init__.py,sha256=…
+docling_core/types/doc/__init__.py,sha256=Vsl3oJV3_BLpS7rIwvahhcWOwmEBvj7ZbQzQCCl-IQk,1678
 docling_core/types/doc/base.py,sha256=i98y4IF250adR-8BSS374K90fwfwG-vBfWh14tLC5Cs,15906
-docling_core/types/doc/document.py,sha256…
+docling_core/types/doc/document.py,sha256=Ab-JOc6fkzocXP3PcxPRXJPjLOhOTYo_0571vSr6VXo,202093
 docling_core/types/doc/labels.py,sha256=-W1-LW6z0J9F9ExJqR0Wd1WeqWTaY3Unm-j1UkQGlC4,7330
 docling_core/types/doc/page.py,sha256=35h1xdtCM3-AaN8Dim9jDseZIiw-3GxpB-ofF-H2rQQ,41878
 docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
-docling_core/types/doc/utils.py,sha256=…
+docling_core/types/doc/utils.py,sha256=wKC9SJgS4ZKdoYPAlNuRyncv9RIEewzVCBmwbUmbA6E,9106
 docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
 docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
 docling_core/types/io/__init__.py,sha256=7QYvFRaDE0AzBg8e7tvsVNlLBbCbAbQ9rP2TU8aXR1k,350
@@ -73,12 +73,12 @@ docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,8
 docling_core/utils/file.py,sha256=CSNclJGL2OwLIc8DQFdoLxr22FUc4_UC7zS6pNrFfkQ,6858
 docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
 docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
-docling_core/utils/legacy.py,sha256=…
+docling_core/utils/legacy.py,sha256=G7ed8fkBpIO8hG3DKEY83cHsrKJHyvDst_1jSdgBXMI,24406
 docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
 docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
-docling_core-2.44.2.dist-info/…
-docling_core-2.44.2.dist-info/…
-docling_core-2.44.2.dist-info/…
-docling_core-2.44.2.dist-info/…
-docling_core-2.44.2.dist-info/…
-docling_core-2.44.2.dist-info/…
+docling_core-2.46.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
+docling_core-2.46.0.dist-info/METADATA,sha256=txMHh-7y8N3RiJ_M_HbrsvzRyGPJVXv8UcA6_DpAfok,6453
+docling_core-2.46.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+docling_core-2.46.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
+docling_core-2.46.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
+docling_core-2.46.0.dist-info/RECORD,,
The remaining dist-info files (WHEEL, entry_points.txt, licenses/LICENSE, top_level.txt) are unchanged.