docling-core 2.34.2__py3-none-any.whl → 2.35.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/visualizer/layout_visualizer.py +8 -4
- docling_core/transforms/visualizer/table_visualizer.py +135 -0
- {docling_core-2.34.2.dist-info → docling_core-2.35.0.dist-info}/METADATA +1 -1
- {docling_core-2.34.2.dist-info → docling_core-2.35.0.dist-info}/RECORD +8 -7
- {docling_core-2.34.2.dist-info → docling_core-2.35.0.dist-info}/WHEEL +0 -0
- {docling_core-2.34.2.dist-info → docling_core-2.35.0.dist-info}/entry_points.txt +0 -0
- {docling_core-2.34.2.dist-info → docling_core-2.35.0.dist-info}/licenses/LICENSE +0 -0
- {docling_core-2.34.2.dist-info → docling_core-2.35.0.dist-info}/top_level.txt +0 -0
|
@@ -119,7 +119,10 @@ class LayoutVisualizer(BaseVisualizer):
|
|
|
119
119
|
)
|
|
120
120
|
|
|
121
121
|
def _draw_doc_layout(
|
|
122
|
-
self,
|
|
122
|
+
self,
|
|
123
|
+
doc: DoclingDocument,
|
|
124
|
+
images: Optional[dict[Optional[int], Image]] = None,
|
|
125
|
+
included_content_layers: Optional[set[ContentLayer]] = None,
|
|
123
126
|
):
|
|
124
127
|
"""Draw the document clusters and optionally the reading order."""
|
|
125
128
|
clusters = []
|
|
@@ -128,6 +131,9 @@ class LayoutVisualizer(BaseVisualizer):
|
|
|
128
131
|
if images is not None:
|
|
129
132
|
my_images = images
|
|
130
133
|
|
|
134
|
+
if included_content_layers is None:
|
|
135
|
+
included_content_layers = {c for c in ContentLayer}
|
|
136
|
+
|
|
131
137
|
# Initialise `my_images` beforehand: sometimes, you have the
|
|
132
138
|
# page-images but no DocItems!
|
|
133
139
|
for page_nr, page in doc.pages.items():
|
|
@@ -141,9 +147,7 @@ class LayoutVisualizer(BaseVisualizer):
|
|
|
141
147
|
prev_image = None
|
|
142
148
|
prev_page_nr = None
|
|
143
149
|
for idx, (elem, _) in enumerate(
|
|
144
|
-
doc.iterate_items(
|
|
145
|
-
included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE}
|
|
146
|
-
)
|
|
150
|
+
doc.iterate_items(included_content_layers=included_content_layers)
|
|
147
151
|
):
|
|
148
152
|
if not isinstance(elem, DocItem):
|
|
149
153
|
continue
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""Define classes for table visualization."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from copy import deepcopy
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from PIL import ImageDraw
|
|
8
|
+
from PIL.Image import Image
|
|
9
|
+
from pydantic import BaseModel
|
|
10
|
+
from typing_extensions import override
|
|
11
|
+
|
|
12
|
+
from docling_core.transforms.visualizer.base import BaseVisualizer
|
|
13
|
+
from docling_core.types.doc.document import ContentLayer, DoclingDocument, TableItem
|
|
14
|
+
|
|
15
|
+
_log = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class TableVisualizer(BaseVisualizer):
    """Visualizer that overlays table cells on the page images of a document.

    When ``base_visualizer`` is set, its output images are used as the canvas
    and the table overlay is drawn on top of them; otherwise copies of the
    document's own page images are used.
    """

    class Params(BaseModel):
        """Table visualization parameters."""

        # show_Label: bool = False
        # Draw a rectangle for every table cell that carries a bounding box.
        show_cells: bool = True
        # show_rows: bool = False
        # show_cols: bool = False

    base_visualizer: Optional[BaseVisualizer] = None
    params: Params = Params()

    def _draw_table_cells(
        self,
        table: TableItem,
        page_image: Image,
        page_height: float,
        scale_x: float,
        scale_y: float,
    ) -> None:
        """Draw individual table cells onto ``page_image`` (in place).

        Args:
            table: Table whose ``data.table_cells`` are drawn.
            page_image: Image to draw on; it is modified in place.
            page_height: Page height, passed to ``to_top_left_origin`` when
                converting each cell bounding box.
            scale_x: Factor applied to the x coordinates of each box.
            scale_y: Factor applied to the y coordinates of each box.
        """
        draw = ImageDraw.Draw(page_image, "RGBA")

        # Translucent red. 8-bit colour channels range from 0 to 255, so the
        # red channel is 255 (the previous value of 256 was out of range).
        cell_fill = (255, 0, 0, 32)
        cell_outline = (255, 0, 0, 128)

        for cell in table.data.table_cells:
            if cell.bbox is None:
                continue  # Nothing to draw for cells without a bounding box

            tl_bbox = cell.bbox.to_top_left_origin(page_height=page_height)
            cx0, cy0, cx1, cy1 = tl_bbox.as_tuple()

            draw.rectangle(
                [
                    (cx0 * scale_x, cy0 * scale_y),
                    (cx1 * scale_x, cy1 * scale_y),
                ],
                outline=cell_outline,
                fill=cell_fill,
            )

    def _draw_doc_tables(
        self,
        doc: DoclingDocument,
        images: Optional[dict[Optional[int], Image]] = None,
        included_content_layers: Optional[set[ContentLayer]] = None,
    ) -> dict[Optional[int], Image]:
        """Draw the document tables.

        Args:
            doc: Document whose tables are visualized.
            images: Optional page-number -> image mapping to draw on. When
                given, this same dict is filled in and returned, and its
                images are drawn on in place.
            included_content_layers: Content layers to iterate over; defaults
                to every member of ``ContentLayer``.

        Returns:
            Mapping from page number to the image carrying the overlay.

        Raises:
            RuntimeError: If a page of ``doc`` has no image, or if a table
                refers to a page for which no image is available.
        """
        my_images: dict[Optional[int], Image] = images if images is not None else {}

        if included_content_layers is None:
            included_content_layers = set(ContentLayer)

        # Initialise `my_images` beforehand: sometimes, you have the
        # page-images but no DocItems!
        for page_nr, page in doc.pages.items():
            page_image = page.image
            if page_image is None or (pil_img := page_image.pil_image) is None:
                raise RuntimeError("Cannot visualize document without images")
            if page_nr not in my_images:
                # Copy so the image stored on the document stays untouched.
                my_images[page_nr] = deepcopy(pil_img)

        for elem, _ in doc.iterate_items(
            included_content_layers=included_content_layers
        ):
            if not isinstance(elem, TableItem):
                continue
            if not elem.prov:
                continue  # Skip elements without provenances
            if len(elem.prov) > 1:
                _log.error("Can not yet visualise tables with multiple provenances")
                continue

            page_nr = elem.prov[0].page_no
            if page_nr not in my_images:
                raise RuntimeError(f"Cannot visualize page-image for {page_nr}")

            if self.params.show_cells:
                image = my_images[page_nr]
                page_size = doc.pages[page_nr].size
                self._draw_table_cells(
                    table=elem,
                    page_height=page_size.height,
                    page_image=image,
                    # Scale from page-size units to image pixels.
                    scale_x=image.width / page_size.width,
                    scale_y=image.height / page_size.height,
                )

        return my_images

    @override
    def get_visualization(
        self,
        *,
        doc: DoclingDocument,
        **kwargs,
    ) -> dict[Optional[int], Image]:
        """Get visualization of the document as images by page.

        Args:
            doc: Document to visualize.
            **kwargs: Forwarded unchanged to ``base_visualizer`` when one is
                configured; not otherwise used here.

        Returns:
            Mapping from page number to the image with the table overlay.
        """
        base_images = (
            self.base_visualizer.get_visualization(doc=doc, **kwargs)
            if self.base_visualizer
            else None
        )
        return self._draw_doc_tables(
            doc=doc,
            images=base_images,
        )
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.34.2
|
|
3
|
+
Version: 2.35.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -34,8 +34,9 @@ docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx9
|
|
|
34
34
|
docling_core/transforms/serializer/markdown.py,sha256=wfMNrjA4wMehWLCejAhEN1eQPRixUO1SyL6ojkKkzZY,20614
|
|
35
35
|
docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
|
|
36
36
|
docling_core/transforms/visualizer/base.py,sha256=aEF7b3rHq6DVdX8zDYEPoq55BHDYe4Hh_97lBdcW4lY,555
|
|
37
|
-
docling_core/transforms/visualizer/layout_visualizer.py,sha256=
|
|
37
|
+
docling_core/transforms/visualizer/layout_visualizer.py,sha256=N3SA9sMkg2bEZ_2r52FpwRXcI3EJ2M5P9LYK4Az4jqQ,7968
|
|
38
38
|
docling_core/transforms/visualizer/reading_order_visualizer.py,sha256=-ej5uLriNtr9C7YBHXMg8sZfB9Uc8cSRr1bJ8FVjpY8,5320
|
|
39
|
+
docling_core/transforms/visualizer/table_visualizer.py,sha256=XlLMSROyRW2UtAjKTltcESSs_rdQNKjO3QvO7ET7uc0,4275
|
|
39
40
|
docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
|
|
40
41
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
41
42
|
docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
|
|
@@ -73,9 +74,9 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
|
|
|
73
74
|
docling_core/utils/legacy.py,sha256=DrI3QGoL755ZCIoKHF74-pTWm8R0zfFo2C2vB5dT2aY,24463
|
|
74
75
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
75
76
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
76
|
-
docling_core-2.
|
|
77
|
-
docling_core-2.
|
|
78
|
-
docling_core-2.
|
|
79
|
-
docling_core-2.
|
|
80
|
-
docling_core-2.
|
|
81
|
-
docling_core-2.
|
|
77
|
+
docling_core-2.35.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
78
|
+
docling_core-2.35.0.dist-info/METADATA,sha256=Gube58hbnoDQGoeGmaK-yrMulAuKHiw7lUeGKxzSDsc,6453
|
|
79
|
+
docling_core-2.35.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
80
|
+
docling_core-2.35.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
|
|
81
|
+
docling_core-2.35.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
|
|
82
|
+
docling_core-2.35.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|