docling-core 2.36.0__py3-none-any.whl → 2.37.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/serializer/html.py +1 -1
- docling_core/transforms/visualizer/table_visualizer.py +109 -4
- docling_core/types/doc/document.py +114 -1
- {docling_core-2.36.0.dist-info → docling_core-2.37.0.dist-info}/METADATA +1 -1
- {docling_core-2.36.0.dist-info → docling_core-2.37.0.dist-info}/RECORD +9 -9
- {docling_core-2.36.0.dist-info → docling_core-2.37.0.dist-info}/WHEEL +0 -0
- {docling_core-2.36.0.dist-info → docling_core-2.37.0.dist-info}/entry_points.txt +0 -0
- {docling_core-2.36.0.dist-info → docling_core-2.37.0.dist-info}/licenses/LICENSE +0 -0
- {docling_core-2.36.0.dist-info → docling_core-2.37.0.dist-info}/top_level.txt +0 -0
|
@@ -340,7 +340,7 @@ class HTMLTableSerializer(BaseTableSerializer):
|
|
|
340
340
|
|
|
341
341
|
content = html.escape(cell.text.strip())
|
|
342
342
|
celltag = "td"
|
|
343
|
-
if cell.column_header:
|
|
343
|
+
if cell.column_header or cell.row_header or cell.row_section:
|
|
344
344
|
celltag = "th"
|
|
345
345
|
|
|
346
346
|
opening_tag = f"{celltag}"
|
|
@@ -23,8 +23,23 @@ class TableVisualizer(BaseVisualizer):
|
|
|
23
23
|
|
|
24
24
|
# show_Label: bool = False
|
|
25
25
|
show_cells: bool = True
|
|
26
|
-
|
|
27
|
-
|
|
26
|
+
show_rows: bool = False
|
|
27
|
+
show_cols: bool = False
|
|
28
|
+
|
|
29
|
+
cell_color: tuple[int, int, int, int] = (256, 0, 0, 32)
|
|
30
|
+
cell_outline: tuple[int, int, int, int] = (256, 0, 0, 128)
|
|
31
|
+
|
|
32
|
+
row_color: tuple[int, int, int, int] = (256, 0, 0, 32)
|
|
33
|
+
row_outline: tuple[int, int, int, int] = (256, 0, 0, 128)
|
|
34
|
+
|
|
35
|
+
row_header_color: tuple[int, int, int, int] = (0, 256, 0, 32)
|
|
36
|
+
row_header_outline: tuple[int, int, int, int] = (0, 256, 0, 128)
|
|
37
|
+
|
|
38
|
+
col_color: tuple[int, int, int, int] = (0, 256, 0, 32)
|
|
39
|
+
col_outline: tuple[int, int, int, int] = (0, 256, 0, 128)
|
|
40
|
+
|
|
41
|
+
col_header_color: tuple[int, int, int, int] = (0, 0, 256, 32)
|
|
42
|
+
col_header_outline: tuple[int, int, int, int] = (0, 0, 256, 128)
|
|
28
43
|
|
|
29
44
|
base_visualizer: Optional[BaseVisualizer] = None
|
|
30
45
|
params: Params = Params()
|
|
@@ -45,7 +60,21 @@ class TableVisualizer(BaseVisualizer):
|
|
|
45
60
|
|
|
46
61
|
tl_bbox = cell.bbox.to_top_left_origin(page_height=page_height)
|
|
47
62
|
|
|
48
|
-
cell_color =
|
|
63
|
+
cell_color = self.params.cell_color # Transparent black for cells
|
|
64
|
+
cell_outline = self.params.cell_outline
|
|
65
|
+
if cell.column_header:
|
|
66
|
+
cell_color = (
|
|
67
|
+
self.params.col_header_color
|
|
68
|
+
) # Transparent black for cells
|
|
69
|
+
cell_outline = self.params.col_header_outline
|
|
70
|
+
if cell.row_header:
|
|
71
|
+
cell_color = (
|
|
72
|
+
self.params.row_header_color
|
|
73
|
+
) # Transparent black for cells
|
|
74
|
+
cell_outline = self.params.row_header_outline
|
|
75
|
+
if cell.row_section:
|
|
76
|
+
cell_color = self.params.row_header_color
|
|
77
|
+
cell_outline = self.params.row_header_outline
|
|
49
78
|
|
|
50
79
|
cx0, cy0, cx1, cy1 = tl_bbox.as_tuple()
|
|
51
80
|
cx0 *= scale_x
|
|
@@ -55,10 +84,68 @@ class TableVisualizer(BaseVisualizer):
|
|
|
55
84
|
|
|
56
85
|
draw.rectangle(
|
|
57
86
|
[(cx0, cy0), (cx1, cy1)],
|
|
58
|
-
outline=
|
|
87
|
+
outline=cell_outline,
|
|
59
88
|
fill=cell_color,
|
|
60
89
|
)
|
|
61
90
|
|
|
91
|
+
def _draw_table_rows(
    self,
    table: TableItem,
    page_image: Image,
    page_height: float,
    scale_x: float,
    scale_y: float,
):
    """Draw one rectangle per table row onto the page image.

    The row boxes come from ``table.data.get_row_bounding_boxes()``. Each box
    is converted to a top-left origin, scaled, and drawn with
    ``self.params.row_outline`` and ``self.params.row_color``.

    Args:
        table: Table item whose ``data`` provides the row bounding boxes.
        page_image: Image handed to ``ImageDraw.Draw``; the rectangles are
            drawn on it.
        page_height: Page height passed to ``to_top_left_origin``.
        scale_x: Factor applied to the horizontal coordinates.
        scale_y: Factor applied to the vertical coordinates.
    """
    draw = ImageDraw.Draw(page_image, "RGBA")

    # Only the boxes are needed here; the row indices (dict keys) are unused.
    for bbox in table.data.get_row_bounding_boxes().values():
        tl_bbox = bbox.to_top_left_origin(page_height=page_height)

        x0, y0, x1, y1 = tl_bbox.as_tuple()
        draw.rectangle(
            [(x0 * scale_x, y0 * scale_y), (x1 * scale_x, y1 * scale_y)],
            outline=self.params.row_outline,
            fill=self.params.row_color,
        )
|
|
119
|
+
|
|
120
|
+
def _draw_table_cols(
    self,
    table: TableItem,
    page_image: Image,
    page_height: float,
    scale_x: float,
    scale_y: float,
):
    """Draw one rectangle per table column onto the page image.

    The column boxes come from ``table.data.get_column_bounding_boxes()``.
    Each box is converted to a top-left origin, scaled, and drawn with
    ``self.params.col_outline`` and ``self.params.col_color``.

    Args:
        table: Table item whose ``data`` provides the column bounding boxes.
        page_image: Image handed to ``ImageDraw.Draw``; the rectangles are
            drawn on it.
        page_height: Page height passed to ``to_top_left_origin``.
        scale_x: Factor applied to the horizontal coordinates.
        scale_y: Factor applied to the vertical coordinates.
    """
    draw = ImageDraw.Draw(page_image, "RGBA")

    # Only the boxes are needed here; the column indices (dict keys) are unused.
    for bbox in table.data.get_column_bounding_boxes().values():
        tl_bbox = bbox.to_top_left_origin(page_height=page_height)

        x0, y0, x1, y1 = tl_bbox.as_tuple()
        draw.rectangle(
            [(x0 * scale_x, y0 * scale_y), (x1 * scale_x, y1 * scale_y)],
            outline=self.params.col_outline,
            fill=self.params.col_color,
        )
|
|
148
|
+
|
|
62
149
|
def _draw_doc_tables(
|
|
63
150
|
self,
|
|
64
151
|
doc: DoclingDocument,
|
|
@@ -108,6 +195,24 @@ class TableVisualizer(BaseVisualizer):
|
|
|
108
195
|
scale_y=image.height / doc.pages[page_nr].size.height,
|
|
109
196
|
)
|
|
110
197
|
|
|
198
|
+
if self.params.show_rows:
|
|
199
|
+
self._draw_table_rows(
|
|
200
|
+
table=elem,
|
|
201
|
+
page_height=doc.pages[page_nr].size.height,
|
|
202
|
+
page_image=image,
|
|
203
|
+
scale_x=image.width / doc.pages[page_nr].size.width,
|
|
204
|
+
scale_y=image.height / doc.pages[page_nr].size.height,
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
if self.params.show_cols:
|
|
208
|
+
self._draw_table_cols(
|
|
209
|
+
table=elem,
|
|
210
|
+
page_height=doc.pages[page_nr].size.height,
|
|
211
|
+
page_image=image,
|
|
212
|
+
scale_x=image.width / doc.pages[page_nr].size.width,
|
|
213
|
+
scale_y=image.height / doc.pages[page_nr].size.height,
|
|
214
|
+
)
|
|
215
|
+
|
|
111
216
|
else:
|
|
112
217
|
raise RuntimeError(f"Cannot visualize page-image for {page_nr}")
|
|
113
218
|
|
|
@@ -38,7 +38,7 @@ from typing_extensions import Annotated, Self, deprecated
|
|
|
38
38
|
from docling_core.search.package import VERSION_PATTERN
|
|
39
39
|
from docling_core.types.base import _JSON_POINTER_REGEX
|
|
40
40
|
from docling_core.types.doc import BoundingBox, Size
|
|
41
|
-
from docling_core.types.doc.base import ImageRefMode
|
|
41
|
+
from docling_core.types.doc.base import CoordOrigin, ImageRefMode
|
|
42
42
|
from docling_core.types.doc.labels import (
|
|
43
43
|
CodeLanguageLabel,
|
|
44
44
|
DocItemLabel,
|
|
@@ -372,6 +372,119 @@ class TableData(BaseModel): # TBD
|
|
|
372
372
|
|
|
373
373
|
return table_data
|
|
374
374
|
|
|
375
|
+
def get_row_bounding_boxes(self) -> dict[int, BoundingBox]:
    """Get the minimal bounding box for each row in the table.

    For every row index, the cells that cover the row and carry a bounding
    box are grouped by their row span. The group with the smallest span is
    passed to ``BoundingBox.enclosing_bbox``; the left/right edges of the
    result are then widened to include every cell that covers the row.

    Returns:
        dict[int, BoundingBox]: Mapping from row index to that row's
        bounding box. Rows in which no cell has a bounding box are omitted.

    Raises:
        ValueError: If the cell bounding boxes do not all share the same
            ``coord_origin``.
    """
    origins = {
        cell.bbox.coord_origin
        for cell in self.table_cells
        if cell.bbox is not None
    }
    if len(origins) > 1:
        # Implicit literal concatenation keeps source indentation out of the
        # message (a backslash continuation inside the literal would not).
        raise ValueError(
            "All bounding boxes must have the same "
            "CoordOrigin to compute their union."
        )

    row_bboxes: dict[int, BoundingBox] = {}

    for row_idx in range(self.num_rows):
        # Bounding boxes of the cells covering this row, keyed by row span.
        bboxes_by_span: dict[int, list[BoundingBox]] = {}

        for cell in self.table_cells:
            if (
                cell.bbox is not None
                and cell.start_row_offset_idx <= row_idx < cell.end_row_offset_idx
            ):
                row_span = cell.end_row_offset_idx - cell.start_row_offset_idx
                bboxes_by_span.setdefault(row_span, []).append(cell.bbox)

        if not bboxes_by_span:
            # No cell with a bounding box covers this row: no entry for it.
            continue

        # Start from the cells with the smallest row span; the top/bottom
        # edges of the result are not touched again below.
        row_bbox: BoundingBox = BoundingBox.enclosing_bbox(
            bboxes_by_span[min(bboxes_by_span)]
        )

        # Widen horizontally so that every cell covering the row is included.
        for bboxes in bboxes_by_span.values():
            for bbox in bboxes:
                row_bbox.l = min(row_bbox.l, bbox.l)
                row_bbox.r = max(row_bbox.r, bbox.r)

        row_bboxes[row_idx] = row_bbox

    return row_bboxes
|
|
428
|
+
|
|
429
|
+
def get_column_bounding_boxes(self) -> dict[int, BoundingBox]:
    """Get the minimal bounding box for each column in the table.

    For every column index, the cells that cover the column and carry a
    bounding box are grouped by their column span. The group with the
    smallest span is passed to ``BoundingBox.enclosing_bbox``; the top/bottom
    edges of the result are then extended to include every cell that covers
    the column.

    Returns:
        dict[int, BoundingBox]: Mapping from column index to that column's
        bounding box. Columns in which no cell has a bounding box are
        omitted.

    Raises:
        ValueError: If the cell bounding boxes do not all share the same
            ``coord_origin``.
    """
    origins = {
        cell.bbox.coord_origin
        for cell in self.table_cells
        if cell.bbox is not None
    }
    if len(origins) > 1:
        # Implicit literal concatenation keeps source indentation out of the
        # message (a backslash continuation inside the literal would not).
        raise ValueError(
            "All bounding boxes must have the same "
            "CoordOrigin to compute their union."
        )

    col_bboxes: dict[int, BoundingBox] = {}

    for col_idx in range(self.num_cols):
        # Bounding boxes of the cells covering this column, keyed by column span.
        bboxes_by_span: dict[int, list[BoundingBox]] = {}

        for cell in self.table_cells:
            if (
                cell.bbox is not None
                and cell.start_col_offset_idx <= col_idx < cell.end_col_offset_idx
            ):
                col_span = cell.end_col_offset_idx - cell.start_col_offset_idx
                bboxes_by_span.setdefault(col_span, []).append(cell.bbox)

        if not bboxes_by_span:
            # No cell with a bounding box covers this column: no entry for it.
            continue

        # Start from the cells with the smallest column span; the left/right
        # edges of the result are not touched again below.
        col_bbox: BoundingBox = BoundingBox.enclosing_bbox(
            bboxes_by_span[min(bboxes_by_span)]
        )

        # Extend vertically so that every cell covering the column is
        # included. Which of min/max applies to t and b depends on the origin.
        for bboxes in bboxes_by_span.values():
            for bbox in bboxes:
                if bbox.coord_origin == CoordOrigin.TOPLEFT:
                    col_bbox.b = max(col_bbox.b, bbox.b)
                    col_bbox.t = min(col_bbox.t, bbox.t)
                elif bbox.coord_origin == CoordOrigin.BOTTOMLEFT:
                    col_bbox.b = min(col_bbox.b, bbox.b)
                    col_bbox.t = max(col_bbox.t, bbox.t)

        col_bboxes[col_idx] = col_bbox

    return col_bboxes
|
|
487
|
+
|
|
375
488
|
|
|
376
489
|
class PictureTabularChartData(PictureChartData):
|
|
377
490
|
"""Base class for picture chart data.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.36.0
|
|
3
|
+
Version: 2.37.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -29,19 +29,19 @@ docling_core/transforms/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3
|
|
|
29
29
|
docling_core/transforms/serializer/base.py,sha256=ZFIiZeplL-QbBs9EDUb1awqxapQ23PsApVetJtAs7Vs,6891
|
|
30
30
|
docling_core/transforms/serializer/common.py,sha256=WP-qO-woidrKyvZ56m0vlKMysoLrMzzZtHSCIwsl3ek,19119
|
|
31
31
|
docling_core/transforms/serializer/doctags.py,sha256=PuAExlP-2HxcDSP_R_phtYQU0yKBW94RrPgb85IUxck,19905
|
|
32
|
-
docling_core/transforms/serializer/html.py,sha256=
|
|
32
|
+
docling_core/transforms/serializer/html.py,sha256=SZgQa0QnknEoRwMFLdgmVsLQqLF2rQl3D7XyEZzUHCE,37151
|
|
33
33
|
docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
|
|
34
34
|
docling_core/transforms/serializer/markdown.py,sha256=wfMNrjA4wMehWLCejAhEN1eQPRixUO1SyL6ojkKkzZY,20614
|
|
35
35
|
docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
|
|
36
36
|
docling_core/transforms/visualizer/base.py,sha256=aEF7b3rHq6DVdX8zDYEPoq55BHDYe4Hh_97lBdcW4lY,555
|
|
37
37
|
docling_core/transforms/visualizer/layout_visualizer.py,sha256=hpq7OnyBgGxt3iW3_aNy9KH_0kmKdgoiJIFPcA2SSHU,8040
|
|
38
38
|
docling_core/transforms/visualizer/reading_order_visualizer.py,sha256=yBra_W33bb16BxrTqP-ABu5NfRplTEJgu3dKdew3zKA,5601
|
|
39
|
-
docling_core/transforms/visualizer/table_visualizer.py,sha256=
|
|
39
|
+
docling_core/transforms/visualizer/table_visualizer.py,sha256=iJPjk-XQSSCH3oujcjPMz-redAwNNHseZ41lFyd-u3k,8097
|
|
40
40
|
docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
|
|
41
41
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
42
42
|
docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
|
|
43
43
|
docling_core/types/doc/base.py,sha256=ndXquBrOKTFQApIJ5s2-zstj3xlVKRbJDSId0KOQnUg,14817
|
|
44
|
-
docling_core/types/doc/document.py,sha256=
|
|
44
|
+
docling_core/types/doc/document.py,sha256=JIrCXTeTYSbjTM1wt6kAbXF6QZ1OepC9vG2C3rO0j8I,153808
|
|
45
45
|
docling_core/types/doc/labels.py,sha256=JiciRK7_DOkebsrfQ6PVCvS__TsKgWn1ANk84BeB14k,7359
|
|
46
46
|
docling_core/types/doc/page.py,sha256=1JMPwglaTITBvg959L_pcWPb-fXoDYGh-e_tGZMzVMQ,41060
|
|
47
47
|
docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
|
|
@@ -74,9 +74,9 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
|
|
|
74
74
|
docling_core/utils/legacy.py,sha256=DrI3QGoL755ZCIoKHF74-pTWm8R0zfFo2C2vB5dT2aY,24463
|
|
75
75
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
76
76
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
77
|
-
docling_core-2.
|
|
78
|
-
docling_core-2.
|
|
79
|
-
docling_core-2.
|
|
80
|
-
docling_core-2.
|
|
81
|
-
docling_core-2.
|
|
82
|
-
docling_core-2.
|
|
77
|
+
docling_core-2.37.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
78
|
+
docling_core-2.37.0.dist-info/METADATA,sha256=B0hyQog06wYqrKsB2jbeiAZ-Rk3Pl_uy2JH7Rws-9EY,6453
|
|
79
|
+
docling_core-2.37.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
80
|
+
docling_core-2.37.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
|
|
81
|
+
docling_core-2.37.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
|
|
82
|
+
docling_core-2.37.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|