docling-core 2.36.0__py3-none-any.whl → 2.38.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/chunker/hybrid_chunker.py +6 -3
- docling_core/transforms/serializer/html.py +1 -1
- docling_core/transforms/visualizer/layout_visualizer.py +2 -2
- docling_core/transforms/visualizer/reading_order_visualizer.py +66 -5
- docling_core/transforms/visualizer/table_visualizer.py +109 -4
- docling_core/types/doc/__init__.py +53 -1
- docling_core/types/doc/document.py +187 -2
- {docling_core-2.36.0.dist-info → docling_core-2.38.0.dist-info}/METADATA +1 -1
- {docling_core-2.36.0.dist-info → docling_core-2.38.0.dist-info}/RECORD +13 -13
- {docling_core-2.36.0.dist-info → docling_core-2.38.0.dist-info}/WHEEL +0 -0
- {docling_core-2.36.0.dist-info → docling_core-2.38.0.dist-info}/entry_points.txt +0 -0
- {docling_core-2.36.0.dist-info → docling_core-2.38.0.dist-info}/licenses/LICENSE +0 -0
- {docling_core-2.36.0.dist-info → docling_core-2.38.0.dist-info}/top_level.txt +0 -0
|
@@ -234,10 +234,13 @@ class HybridChunker(BaseChunker):
|
|
|
234
234
|
if available_length <= 0:
|
|
235
235
|
warnings.warn(
|
|
236
236
|
"Headers and captions for this chunk are longer than the total "
|
|
237
|
-
"
|
|
238
|
-
f"{doc_chunk.text=}"
|
|
237
|
+
"available size for the chunk, so they will be ignored: "
|
|
238
|
+
f"{doc_chunk.text=}, {doc_chunk.meta=}"
|
|
239
239
|
)
|
|
240
|
-
|
|
240
|
+
new_chunk = DocChunk(**doc_chunk.export_json_dict())
|
|
241
|
+
new_chunk.meta.captions = None
|
|
242
|
+
new_chunk.meta.headings = None
|
|
243
|
+
return self._split_using_plain_text(doc_chunk=new_chunk)
|
|
241
244
|
text = doc_chunk.text
|
|
242
245
|
segments = sem_chunker.chunk(text)
|
|
243
246
|
chunks = [DocChunk(text=s, meta=doc_chunk.meta) for s in segments]
|
|
@@ -340,7 +340,7 @@ class HTMLTableSerializer(BaseTableSerializer):
|
|
|
340
340
|
|
|
341
341
|
content = html.escape(cell.text.strip())
|
|
342
342
|
celltag = "td"
|
|
343
|
-
if cell.column_header:
|
|
343
|
+
if cell.column_header or cell.row_header or cell.row_section:
|
|
344
344
|
celltag = "th"
|
|
345
345
|
|
|
346
346
|
opening_tag = f"{celltag}"
|
|
@@ -163,8 +163,8 @@ class LayoutVisualizer(BaseVisualizer):
|
|
|
163
163
|
else:
|
|
164
164
|
raise RuntimeError(f"Cannot visualize page-image for {page_nr}")
|
|
165
165
|
|
|
166
|
-
if prev_page_nr is None or page_nr
|
|
167
|
-
#
|
|
166
|
+
if prev_page_nr is None or page_nr != prev_page_nr: # changing page
|
|
167
|
+
# dump previous drawing
|
|
168
168
|
if prev_page_nr is not None and prev_image and clusters:
|
|
169
169
|
self._draw_clusters(
|
|
170
170
|
image=prev_image,
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
"""Define classes for reading order visualization."""
|
|
2
2
|
|
|
3
3
|
from copy import deepcopy
|
|
4
|
-
from typing import Optional
|
|
4
|
+
from typing import Optional, Union
|
|
5
5
|
|
|
6
|
-
from PIL import ImageDraw
|
|
6
|
+
from PIL import ImageDraw, ImageFont
|
|
7
7
|
from PIL.Image import Image
|
|
8
|
+
from PIL.ImageFont import FreeTypeFont
|
|
8
9
|
from pydantic import BaseModel
|
|
9
10
|
from typing_extensions import override
|
|
10
11
|
|
|
@@ -12,6 +13,11 @@ from docling_core.transforms.visualizer.base import BaseVisualizer
|
|
|
12
13
|
from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocument
|
|
13
14
|
|
|
14
15
|
|
|
16
|
+
class _NumberDrawingData(BaseModel):
|
|
17
|
+
xy: tuple[float, float]
|
|
18
|
+
text: str
|
|
19
|
+
|
|
20
|
+
|
|
15
21
|
class ReadingOrderVisualizer(BaseVisualizer):
|
|
16
22
|
"""Reading order visualizer."""
|
|
17
23
|
|
|
@@ -19,6 +25,7 @@ class ReadingOrderVisualizer(BaseVisualizer):
|
|
|
19
25
|
"""Layout visualization parameters."""
|
|
20
26
|
|
|
21
27
|
show_label: bool = True
|
|
28
|
+
show_branch_numbering: bool = False
|
|
22
29
|
content_layers: set[ContentLayer] = {
|
|
23
30
|
cl for cl in ContentLayer if cl != ContentLayer.BACKGROUND
|
|
24
31
|
}
|
|
@@ -76,10 +83,17 @@ class ReadingOrderVisualizer(BaseVisualizer):
|
|
|
76
83
|
images: Optional[dict[Optional[int], Image]] = None,
|
|
77
84
|
):
|
|
78
85
|
"""Draw the reading order."""
|
|
79
|
-
|
|
86
|
+
font: Union[ImageFont.ImageFont, FreeTypeFont]
|
|
87
|
+
try:
|
|
88
|
+
font = ImageFont.truetype("arial.ttf", 12)
|
|
89
|
+
except OSError:
|
|
90
|
+
# Fallback to default font if arial is not available
|
|
91
|
+
font = ImageFont.load_default()
|
|
80
92
|
x0, y0 = None, None
|
|
93
|
+
number_data_to_draw: dict[Optional[int], list[_NumberDrawingData]] = {}
|
|
81
94
|
my_images: dict[Optional[int], Image] = images or {}
|
|
82
95
|
prev_page = None
|
|
96
|
+
i = 0
|
|
83
97
|
for elem, _ in doc.iterate_items(
|
|
84
98
|
included_content_layers=self.params.content_layers,
|
|
85
99
|
):
|
|
@@ -92,7 +106,10 @@ class ReadingOrderVisualizer(BaseVisualizer):
|
|
|
92
106
|
page_no = prov.page_no
|
|
93
107
|
image = my_images.get(page_no)
|
|
94
108
|
|
|
95
|
-
if
|
|
109
|
+
if page_no not in number_data_to_draw:
|
|
110
|
+
number_data_to_draw[page_no] = []
|
|
111
|
+
|
|
112
|
+
if image is None or prev_page is None or page_no != prev_page:
|
|
96
113
|
# new page begins
|
|
97
114
|
prev_page = page_no
|
|
98
115
|
x0 = y0 = None
|
|
@@ -109,7 +126,7 @@ class ReadingOrderVisualizer(BaseVisualizer):
|
|
|
109
126
|
else:
|
|
110
127
|
image = deepcopy(pil_img)
|
|
111
128
|
my_images[page_no] = image
|
|
112
|
-
draw = ImageDraw.Draw(image)
|
|
129
|
+
draw = ImageDraw.Draw(image, "RGBA")
|
|
113
130
|
|
|
114
131
|
tlo_bbox = prov.bbox.to_top_left_origin(
|
|
115
132
|
page_height=doc.pages[prov.page_no].size.height
|
|
@@ -124,9 +141,20 @@ class ReadingOrderVisualizer(BaseVisualizer):
|
|
|
124
141
|
ro_bbox.b, ro_bbox.t = ro_bbox.t, ro_bbox.b
|
|
125
142
|
|
|
126
143
|
if x0 is None and y0 is None:
|
|
144
|
+
# is_root= True
|
|
127
145
|
x0 = (ro_bbox.l + ro_bbox.r) / 2.0
|
|
128
146
|
y0 = (ro_bbox.b + ro_bbox.t) / 2.0
|
|
147
|
+
|
|
148
|
+
number_data_to_draw[page_no].append(
|
|
149
|
+
_NumberDrawingData(
|
|
150
|
+
xy=(x0, y0),
|
|
151
|
+
text=f"{i}",
|
|
152
|
+
)
|
|
153
|
+
)
|
|
154
|
+
i += 1
|
|
155
|
+
|
|
129
156
|
else:
|
|
157
|
+
# is_root = False
|
|
130
158
|
assert x0 is not None
|
|
131
159
|
assert y0 is not None
|
|
132
160
|
|
|
@@ -139,7 +167,40 @@ class ReadingOrderVisualizer(BaseVisualizer):
|
|
|
139
167
|
line_width=2,
|
|
140
168
|
color="red",
|
|
141
169
|
)
|
|
170
|
+
|
|
142
171
|
x0, y0 = x1, y1
|
|
172
|
+
|
|
173
|
+
if self.params.show_branch_numbering:
|
|
174
|
+
# post-drawing the numbers to ensure they are rendered on top-layer
|
|
175
|
+
for page in number_data_to_draw:
|
|
176
|
+
if (image := my_images.get(page)) is None:
|
|
177
|
+
continue
|
|
178
|
+
draw = ImageDraw.Draw(image, "RGBA")
|
|
179
|
+
|
|
180
|
+
for num_item in number_data_to_draw[page]:
|
|
181
|
+
|
|
182
|
+
text_bbox = draw.textbbox(num_item.xy, num_item.text, font)
|
|
183
|
+
text_bg_padding = 5
|
|
184
|
+
draw.ellipse(
|
|
185
|
+
[
|
|
186
|
+
(
|
|
187
|
+
text_bbox[0] - text_bg_padding,
|
|
188
|
+
text_bbox[1] - text_bg_padding,
|
|
189
|
+
),
|
|
190
|
+
(
|
|
191
|
+
text_bbox[2] + text_bg_padding,
|
|
192
|
+
text_bbox[3] + text_bg_padding,
|
|
193
|
+
),
|
|
194
|
+
],
|
|
195
|
+
fill="orange",
|
|
196
|
+
)
|
|
197
|
+
draw.text(
|
|
198
|
+
num_item.xy,
|
|
199
|
+
text=num_item.text,
|
|
200
|
+
fill="black",
|
|
201
|
+
font=font,
|
|
202
|
+
)
|
|
203
|
+
|
|
143
204
|
return my_images
|
|
144
205
|
|
|
145
206
|
@override
|
|
@@ -23,8 +23,23 @@ class TableVisualizer(BaseVisualizer):
|
|
|
23
23
|
|
|
24
24
|
# show_Label: bool = False
|
|
25
25
|
show_cells: bool = True
|
|
26
|
-
|
|
27
|
-
|
|
26
|
+
show_rows: bool = False
|
|
27
|
+
show_cols: bool = False
|
|
28
|
+
|
|
29
|
+
cell_color: tuple[int, int, int, int] = (256, 0, 0, 32)
|
|
30
|
+
cell_outline: tuple[int, int, int, int] = (256, 0, 0, 128)
|
|
31
|
+
|
|
32
|
+
row_color: tuple[int, int, int, int] = (256, 0, 0, 32)
|
|
33
|
+
row_outline: tuple[int, int, int, int] = (256, 0, 0, 128)
|
|
34
|
+
|
|
35
|
+
row_header_color: tuple[int, int, int, int] = (0, 256, 0, 32)
|
|
36
|
+
row_header_outline: tuple[int, int, int, int] = (0, 256, 0, 128)
|
|
37
|
+
|
|
38
|
+
col_color: tuple[int, int, int, int] = (0, 256, 0, 32)
|
|
39
|
+
col_outline: tuple[int, int, int, int] = (0, 256, 0, 128)
|
|
40
|
+
|
|
41
|
+
col_header_color: tuple[int, int, int, int] = (0, 0, 256, 32)
|
|
42
|
+
col_header_outline: tuple[int, int, int, int] = (0, 0, 256, 128)
|
|
28
43
|
|
|
29
44
|
base_visualizer: Optional[BaseVisualizer] = None
|
|
30
45
|
params: Params = Params()
|
|
@@ -45,7 +60,21 @@ class TableVisualizer(BaseVisualizer):
|
|
|
45
60
|
|
|
46
61
|
tl_bbox = cell.bbox.to_top_left_origin(page_height=page_height)
|
|
47
62
|
|
|
48
|
-
cell_color =
|
|
63
|
+
cell_color = self.params.cell_color # Transparent black for cells
|
|
64
|
+
cell_outline = self.params.cell_outline
|
|
65
|
+
if cell.column_header:
|
|
66
|
+
cell_color = (
|
|
67
|
+
self.params.col_header_color
|
|
68
|
+
) # Transparent black for cells
|
|
69
|
+
cell_outline = self.params.col_header_outline
|
|
70
|
+
if cell.row_header:
|
|
71
|
+
cell_color = (
|
|
72
|
+
self.params.row_header_color
|
|
73
|
+
) # Transparent black for cells
|
|
74
|
+
cell_outline = self.params.row_header_outline
|
|
75
|
+
if cell.row_section:
|
|
76
|
+
cell_color = self.params.row_header_color
|
|
77
|
+
cell_outline = self.params.row_header_outline
|
|
49
78
|
|
|
50
79
|
cx0, cy0, cx1, cy1 = tl_bbox.as_tuple()
|
|
51
80
|
cx0 *= scale_x
|
|
@@ -55,10 +84,68 @@ class TableVisualizer(BaseVisualizer):
|
|
|
55
84
|
|
|
56
85
|
draw.rectangle(
|
|
57
86
|
[(cx0, cy0), (cx1, cy1)],
|
|
58
|
-
outline=
|
|
87
|
+
outline=cell_outline,
|
|
59
88
|
fill=cell_color,
|
|
60
89
|
)
|
|
61
90
|
|
|
91
|
+
def _draw_table_rows(
|
|
92
|
+
self,
|
|
93
|
+
table: TableItem,
|
|
94
|
+
page_image: Image,
|
|
95
|
+
page_height: float,
|
|
96
|
+
scale_x: float,
|
|
97
|
+
scale_y: float,
|
|
98
|
+
):
|
|
99
|
+
"""Draw individual table cells."""
|
|
100
|
+
draw = ImageDraw.Draw(page_image, "RGBA")
|
|
101
|
+
|
|
102
|
+
rows = table.data.get_row_bounding_boxes()
|
|
103
|
+
|
|
104
|
+
for rid, bbox in rows.items():
|
|
105
|
+
|
|
106
|
+
tl_bbox = bbox.to_top_left_origin(page_height=page_height)
|
|
107
|
+
|
|
108
|
+
cx0, cy0, cx1, cy1 = tl_bbox.as_tuple()
|
|
109
|
+
cx0 *= scale_x
|
|
110
|
+
cx1 *= scale_x
|
|
111
|
+
cy0 *= scale_y
|
|
112
|
+
cy1 *= scale_y
|
|
113
|
+
|
|
114
|
+
draw.rectangle(
|
|
115
|
+
[(cx0, cy0), (cx1, cy1)],
|
|
116
|
+
outline=self.params.row_outline,
|
|
117
|
+
fill=self.params.row_color,
|
|
118
|
+
)
|
|
119
|
+
|
|
120
|
+
def _draw_table_cols(
|
|
121
|
+
self,
|
|
122
|
+
table: TableItem,
|
|
123
|
+
page_image: Image,
|
|
124
|
+
page_height: float,
|
|
125
|
+
scale_x: float,
|
|
126
|
+
scale_y: float,
|
|
127
|
+
):
|
|
128
|
+
"""Draw individual table cells."""
|
|
129
|
+
draw = ImageDraw.Draw(page_image, "RGBA")
|
|
130
|
+
|
|
131
|
+
cols = table.data.get_column_bounding_boxes()
|
|
132
|
+
|
|
133
|
+
for cid, bbox in cols.items():
|
|
134
|
+
|
|
135
|
+
tl_bbox = bbox.to_top_left_origin(page_height=page_height)
|
|
136
|
+
|
|
137
|
+
cx0, cy0, cx1, cy1 = tl_bbox.as_tuple()
|
|
138
|
+
cx0 *= scale_x
|
|
139
|
+
cx1 *= scale_x
|
|
140
|
+
cy0 *= scale_y
|
|
141
|
+
cy1 *= scale_y
|
|
142
|
+
|
|
143
|
+
draw.rectangle(
|
|
144
|
+
[(cx0, cy0), (cx1, cy1)],
|
|
145
|
+
outline=self.params.col_outline,
|
|
146
|
+
fill=self.params.col_color,
|
|
147
|
+
)
|
|
148
|
+
|
|
62
149
|
def _draw_doc_tables(
|
|
63
150
|
self,
|
|
64
151
|
doc: DoclingDocument,
|
|
@@ -108,6 +195,24 @@ class TableVisualizer(BaseVisualizer):
|
|
|
108
195
|
scale_y=image.height / doc.pages[page_nr].size.height,
|
|
109
196
|
)
|
|
110
197
|
|
|
198
|
+
if self.params.show_rows:
|
|
199
|
+
self._draw_table_rows(
|
|
200
|
+
table=elem,
|
|
201
|
+
page_height=doc.pages[page_nr].size.height,
|
|
202
|
+
page_image=image,
|
|
203
|
+
scale_x=image.width / doc.pages[page_nr].size.width,
|
|
204
|
+
scale_y=image.height / doc.pages[page_nr].size.height,
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
if self.params.show_cols:
|
|
208
|
+
self._draw_table_cols(
|
|
209
|
+
table=elem,
|
|
210
|
+
page_height=doc.pages[page_nr].size.height,
|
|
211
|
+
page_image=image,
|
|
212
|
+
scale_x=image.width / doc.pages[page_nr].size.width,
|
|
213
|
+
scale_y=image.height / doc.pages[page_nr].size.height,
|
|
214
|
+
)
|
|
215
|
+
|
|
111
216
|
else:
|
|
112
217
|
raise RuntimeError(f"Cannot visualize page-image for {page_nr}")
|
|
113
218
|
|
|
@@ -7,26 +7,78 @@
|
|
|
7
7
|
|
|
8
8
|
from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
|
|
9
9
|
from .document import (
|
|
10
|
+
BaseAnnotation,
|
|
11
|
+
ChartBar,
|
|
12
|
+
ChartLine,
|
|
13
|
+
ChartPoint,
|
|
14
|
+
ChartSlice,
|
|
15
|
+
ChartStackedBar,
|
|
10
16
|
CodeItem,
|
|
17
|
+
ContentLayer,
|
|
18
|
+
DescriptionAnnotation,
|
|
11
19
|
DocItem,
|
|
12
20
|
DoclingDocument,
|
|
21
|
+
DocTagsDocument,
|
|
22
|
+
DocTagsPage,
|
|
13
23
|
DocumentOrigin,
|
|
14
24
|
FloatingItem,
|
|
25
|
+
Formatting,
|
|
26
|
+
FormItem,
|
|
27
|
+
FormulaItem,
|
|
28
|
+
GraphCell,
|
|
29
|
+
GraphData,
|
|
30
|
+
GraphLink,
|
|
15
31
|
GroupItem,
|
|
16
32
|
ImageRef,
|
|
33
|
+
InlineGroup,
|
|
17
34
|
KeyValueItem,
|
|
35
|
+
ListItem,
|
|
36
|
+
MiscAnnotation,
|
|
18
37
|
NodeItem,
|
|
38
|
+
OrderedList,
|
|
19
39
|
PageItem,
|
|
40
|
+
PictureBarChartData,
|
|
41
|
+
PictureChartData,
|
|
20
42
|
PictureClassificationClass,
|
|
21
43
|
PictureClassificationData,
|
|
22
44
|
PictureDataType,
|
|
23
45
|
PictureItem,
|
|
46
|
+
PictureLineChartData,
|
|
47
|
+
PictureMoleculeData,
|
|
48
|
+
PicturePieChartData,
|
|
49
|
+
PictureScatterChartData,
|
|
50
|
+
PictureStackedBarChartData,
|
|
51
|
+
PictureTabularChartData,
|
|
24
52
|
ProvenanceItem,
|
|
25
53
|
RefItem,
|
|
54
|
+
Script,
|
|
26
55
|
SectionHeaderItem,
|
|
27
56
|
TableCell,
|
|
28
57
|
TableData,
|
|
29
58
|
TableItem,
|
|
30
59
|
TextItem,
|
|
60
|
+
TitleItem,
|
|
61
|
+
UnorderedList,
|
|
31
62
|
)
|
|
32
|
-
from .labels import
|
|
63
|
+
from .labels import (
|
|
64
|
+
CodeLanguageLabel,
|
|
65
|
+
DocItemLabel,
|
|
66
|
+
GraphCellLabel,
|
|
67
|
+
GraphLinkLabel,
|
|
68
|
+
GroupLabel,
|
|
69
|
+
PictureClassificationLabel,
|
|
70
|
+
TableCellLabel,
|
|
71
|
+
)
|
|
72
|
+
from .page import (
|
|
73
|
+
BoundingRectangle,
|
|
74
|
+
ColorMixin,
|
|
75
|
+
ColorRGBA,
|
|
76
|
+
Coord2D,
|
|
77
|
+
OrderedElement,
|
|
78
|
+
PdfCellRenderingMode,
|
|
79
|
+
PdfPageBoundaryType,
|
|
80
|
+
TextCell,
|
|
81
|
+
TextCellUnit,
|
|
82
|
+
TextDirection,
|
|
83
|
+
)
|
|
84
|
+
from .tokens import DocumentToken, TableToken
|
|
@@ -38,7 +38,7 @@ from typing_extensions import Annotated, Self, deprecated
|
|
|
38
38
|
from docling_core.search.package import VERSION_PATTERN
|
|
39
39
|
from docling_core.types.base import _JSON_POINTER_REGEX
|
|
40
40
|
from docling_core.types.doc import BoundingBox, Size
|
|
41
|
-
from docling_core.types.doc.base import ImageRefMode
|
|
41
|
+
from docling_core.types.doc.base import CoordOrigin, ImageRefMode
|
|
42
42
|
from docling_core.types.doc.labels import (
|
|
43
43
|
CodeLanguageLabel,
|
|
44
44
|
DocItemLabel,
|
|
@@ -372,6 +372,119 @@ class TableData(BaseModel): # TBD
|
|
|
372
372
|
|
|
373
373
|
return table_data
|
|
374
374
|
|
|
375
|
+
def get_row_bounding_boxes(self) -> dict[int, BoundingBox]:
|
|
376
|
+
"""Get the minimal bounding box for each row in the table.
|
|
377
|
+
|
|
378
|
+
Returns:
|
|
379
|
+
List[Optional[BoundingBox]]: A list where each element is the minimal
|
|
380
|
+
bounding box that encompasses all cells in that row, or None if no
|
|
381
|
+
cells in the row have bounding boxes.
|
|
382
|
+
"""
|
|
383
|
+
coords = []
|
|
384
|
+
for cell in self.table_cells:
|
|
385
|
+
if cell.bbox is not None:
|
|
386
|
+
coords.append(cell.bbox.coord_origin)
|
|
387
|
+
|
|
388
|
+
if len(set(coords)) > 1:
|
|
389
|
+
raise ValueError(
|
|
390
|
+
"All bounding boxes must have the same \
|
|
391
|
+
CoordOrigin to compute their union."
|
|
392
|
+
)
|
|
393
|
+
|
|
394
|
+
row_bboxes: dict[int, BoundingBox] = {}
|
|
395
|
+
|
|
396
|
+
for row_idx in range(self.num_rows):
|
|
397
|
+
row_cells_with_bbox: dict[int, list[BoundingBox]] = {}
|
|
398
|
+
|
|
399
|
+
# Collect all cells in this row that have bounding boxes
|
|
400
|
+
for cell in self.table_cells:
|
|
401
|
+
|
|
402
|
+
if (
|
|
403
|
+
cell.bbox is not None
|
|
404
|
+
and cell.start_row_offset_idx <= row_idx < cell.end_row_offset_idx
|
|
405
|
+
):
|
|
406
|
+
|
|
407
|
+
row_span = cell.end_row_offset_idx - cell.start_row_offset_idx
|
|
408
|
+
if row_span in row_cells_with_bbox:
|
|
409
|
+
row_cells_with_bbox[row_span].append(cell.bbox)
|
|
410
|
+
else:
|
|
411
|
+
row_cells_with_bbox[row_span] = [cell.bbox]
|
|
412
|
+
|
|
413
|
+
# Calculate the enclosing bounding box for this row
|
|
414
|
+
if len(row_cells_with_bbox) > 0:
|
|
415
|
+
min_row_span = min(row_cells_with_bbox.keys())
|
|
416
|
+
row_bbox: BoundingBox = BoundingBox.enclosing_bbox(
|
|
417
|
+
row_cells_with_bbox[min_row_span]
|
|
418
|
+
)
|
|
419
|
+
|
|
420
|
+
for rspan, bboxs in row_cells_with_bbox.items():
|
|
421
|
+
for bbox in bboxs:
|
|
422
|
+
row_bbox.l = min(row_bbox.l, bbox.l)
|
|
423
|
+
row_bbox.r = max(row_bbox.r, bbox.r)
|
|
424
|
+
|
|
425
|
+
row_bboxes[row_idx] = row_bbox
|
|
426
|
+
|
|
427
|
+
return row_bboxes
|
|
428
|
+
|
|
429
|
+
def get_column_bounding_boxes(self) -> dict[int, BoundingBox]:
|
|
430
|
+
"""Get the minimal bounding box for each column in the table.
|
|
431
|
+
|
|
432
|
+
Returns:
|
|
433
|
+
List[Optional[BoundingBox]]: A list where each element is the minimal
|
|
434
|
+
bounding box that encompasses all cells in that column, or None if no
|
|
435
|
+
cells in the column have bounding boxes.
|
|
436
|
+
"""
|
|
437
|
+
coords = []
|
|
438
|
+
for cell in self.table_cells:
|
|
439
|
+
if cell.bbox is not None:
|
|
440
|
+
coords.append(cell.bbox.coord_origin)
|
|
441
|
+
|
|
442
|
+
if len(set(coords)) > 1:
|
|
443
|
+
raise ValueError(
|
|
444
|
+
"All bounding boxes must have the same \
|
|
445
|
+
CoordOrigin to compute their union."
|
|
446
|
+
)
|
|
447
|
+
|
|
448
|
+
col_bboxes: dict[int, BoundingBox] = {}
|
|
449
|
+
|
|
450
|
+
for col_idx in range(self.num_cols):
|
|
451
|
+
col_cells_with_bbox: dict[int, list[BoundingBox]] = {}
|
|
452
|
+
|
|
453
|
+
# Collect all cells in this row that have bounding boxes
|
|
454
|
+
for cell in self.table_cells:
|
|
455
|
+
|
|
456
|
+
if (
|
|
457
|
+
cell.bbox is not None
|
|
458
|
+
and cell.start_col_offset_idx <= col_idx < cell.end_col_offset_idx
|
|
459
|
+
):
|
|
460
|
+
|
|
461
|
+
col_span = cell.end_col_offset_idx - cell.start_col_offset_idx
|
|
462
|
+
if col_span in col_cells_with_bbox:
|
|
463
|
+
col_cells_with_bbox[col_span].append(cell.bbox)
|
|
464
|
+
else:
|
|
465
|
+
col_cells_with_bbox[col_span] = [cell.bbox]
|
|
466
|
+
|
|
467
|
+
# Calculate the enclosing bounding box for this row
|
|
468
|
+
if len(col_cells_with_bbox) > 0:
|
|
469
|
+
min_col_span = min(col_cells_with_bbox.keys())
|
|
470
|
+
col_bbox: BoundingBox = BoundingBox.enclosing_bbox(
|
|
471
|
+
col_cells_with_bbox[min_col_span]
|
|
472
|
+
)
|
|
473
|
+
|
|
474
|
+
for rspan, bboxs in col_cells_with_bbox.items():
|
|
475
|
+
for bbox in bboxs:
|
|
476
|
+
if bbox.coord_origin == CoordOrigin.TOPLEFT:
|
|
477
|
+
col_bbox.b = max(col_bbox.b, bbox.b)
|
|
478
|
+
col_bbox.t = min(col_bbox.t, bbox.t)
|
|
479
|
+
|
|
480
|
+
elif bbox.coord_origin == CoordOrigin.BOTTOMLEFT:
|
|
481
|
+
col_bbox.b = min(col_bbox.b, bbox.b)
|
|
482
|
+
col_bbox.t = max(col_bbox.t, bbox.t)
|
|
483
|
+
|
|
484
|
+
col_bboxes[col_idx] = col_bbox
|
|
485
|
+
|
|
486
|
+
return col_bboxes
|
|
487
|
+
|
|
375
488
|
|
|
376
489
|
class PictureTabularChartData(PictureChartData):
|
|
377
490
|
"""Base class for picture chart data.
|
|
@@ -4056,6 +4169,7 @@ class DoclingDocument(BaseModel):
|
|
|
4056
4169
|
add_table_cell_location: bool = False,
|
|
4057
4170
|
add_table_cell_text: bool = True,
|
|
4058
4171
|
minified: bool = False,
|
|
4172
|
+
pages: Optional[set[int]] = None,
|
|
4059
4173
|
) -> str:
|
|
4060
4174
|
r"""Exports the document content to a DocumentToken format.
|
|
4061
4175
|
|
|
@@ -4074,6 +4188,7 @@ class DoclingDocument(BaseModel):
|
|
|
4074
4188
|
:param # table specific flagsadd_table_cell_location: bool
|
|
4075
4189
|
:param add_table_cell_text: bool: (Default value = True)
|
|
4076
4190
|
:param minified: bool: (Default value = False)
|
|
4191
|
+
:param pages: set[int]: (Default value = None)
|
|
4077
4192
|
:returns: The content of the document formatted as a DocTags string.
|
|
4078
4193
|
:rtype: str
|
|
4079
4194
|
"""
|
|
@@ -4098,6 +4213,7 @@ class DoclingDocument(BaseModel):
|
|
|
4098
4213
|
add_page_break=add_page_index,
|
|
4099
4214
|
add_table_cell_location=add_table_cell_location,
|
|
4100
4215
|
add_table_cell_text=add_table_cell_text,
|
|
4216
|
+
pages=pages,
|
|
4101
4217
|
mode=(
|
|
4102
4218
|
DocTagsParams.Mode.MINIFIED
|
|
4103
4219
|
if minified
|
|
@@ -4237,7 +4353,9 @@ class DoclingDocument(BaseModel):
|
|
|
4237
4353
|
return pitem
|
|
4238
4354
|
|
|
4239
4355
|
def get_visualization(
|
|
4240
|
-
self,
|
|
4356
|
+
self,
|
|
4357
|
+
show_label: bool = True,
|
|
4358
|
+
show_branch_numbering: bool = False,
|
|
4241
4359
|
) -> dict[Optional[int], PILImage.Image]:
|
|
4242
4360
|
"""Get visualization of the document as images by page."""
|
|
4243
4361
|
from docling_core.transforms.visualizer.layout_visualizer import (
|
|
@@ -4253,6 +4371,9 @@ class DoclingDocument(BaseModel):
|
|
|
4253
4371
|
show_label=show_label,
|
|
4254
4372
|
),
|
|
4255
4373
|
),
|
|
4374
|
+
params=ReadingOrderVisualizer.Params(
|
|
4375
|
+
show_branch_numbering=show_branch_numbering,
|
|
4376
|
+
),
|
|
4256
4377
|
)
|
|
4257
4378
|
images = visualizer.get_visualization(doc=self)
|
|
4258
4379
|
|
|
@@ -4343,3 +4464,67 @@ class DoclingDocument(BaseModel):
|
|
|
4343
4464
|
hyperlink=li.hyperlink,
|
|
4344
4465
|
)
|
|
4345
4466
|
return self
|
|
4467
|
+
|
|
4468
|
+
def _normalize_references(self) -> None:
|
|
4469
|
+
"""Normalize ref numbering by ordering node items as per iterate_items()."""
|
|
4470
|
+
new_body = GroupItem(**self.body.model_dump(exclude={"children"}))
|
|
4471
|
+
|
|
4472
|
+
item_lists: dict[str, list[NodeItem]] = {
|
|
4473
|
+
"groups": [],
|
|
4474
|
+
"texts": [],
|
|
4475
|
+
"pictures": [],
|
|
4476
|
+
"tables": [],
|
|
4477
|
+
"key_value_items": [],
|
|
4478
|
+
"form_items": [],
|
|
4479
|
+
}
|
|
4480
|
+
orig_ref_to_new_ref: dict[str, str] = {}
|
|
4481
|
+
|
|
4482
|
+
# collect items in traversal order
|
|
4483
|
+
for item, _ in self.iterate_items(
|
|
4484
|
+
with_groups=True,
|
|
4485
|
+
traverse_pictures=True,
|
|
4486
|
+
included_content_layers={c for c in ContentLayer},
|
|
4487
|
+
):
|
|
4488
|
+
key = item.self_ref.split("/")[1]
|
|
4489
|
+
is_body = key == "body"
|
|
4490
|
+
new_cref = "#/body" if is_body else f"#/{key}/{len(item_lists[key])}"
|
|
4491
|
+
# register cref mapping:
|
|
4492
|
+
orig_ref_to_new_ref[item.self_ref] = new_cref
|
|
4493
|
+
|
|
4494
|
+
if not is_body:
|
|
4495
|
+
new_item = copy.deepcopy(item)
|
|
4496
|
+
new_item.children = []
|
|
4497
|
+
|
|
4498
|
+
# put item in the right list
|
|
4499
|
+
item_lists[key].append(new_item)
|
|
4500
|
+
|
|
4501
|
+
# update item's self reference
|
|
4502
|
+
new_item.self_ref = new_cref
|
|
4503
|
+
|
|
4504
|
+
if item.parent:
|
|
4505
|
+
# set item's parent
|
|
4506
|
+
new_parent_cref = orig_ref_to_new_ref[item.parent.cref]
|
|
4507
|
+
new_item.parent = RefItem(cref=new_parent_cref)
|
|
4508
|
+
|
|
4509
|
+
# add item to parent's children
|
|
4510
|
+
path_components = new_parent_cref.split("/")
|
|
4511
|
+
num_components = len(path_components)
|
|
4512
|
+
parent_node: NodeItem
|
|
4513
|
+
if num_components == 3:
|
|
4514
|
+
_, parent_key, parent_index_str = path_components
|
|
4515
|
+
parent_index = int(parent_index_str)
|
|
4516
|
+
parent_node = item_lists[parent_key][parent_index]
|
|
4517
|
+
elif num_components == 2 and path_components[1] == "body":
|
|
4518
|
+
parent_node = new_body
|
|
4519
|
+
else:
|
|
4520
|
+
raise RuntimeError(f"Unsupported ref format: {new_parent_cref}")
|
|
4521
|
+
parent_node.children.append(RefItem(cref=new_cref))
|
|
4522
|
+
|
|
4523
|
+
# update document
|
|
4524
|
+
self.groups = item_lists["groups"] # type: ignore
|
|
4525
|
+
self.texts = item_lists["texts"] # type: ignore
|
|
4526
|
+
self.pictures = item_lists["pictures"] # type: ignore
|
|
4527
|
+
self.tables = item_lists["tables"] # type: ignore
|
|
4528
|
+
self.key_value_items = item_lists["key_value_items"] # type: ignore
|
|
4529
|
+
self.form_items = item_lists["form_items"] # type: ignore
|
|
4530
|
+
self.body = new_body
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.36.0
|
|
3
|
+
Version: 2.38.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -20,7 +20,7 @@ docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9AC
|
|
|
20
20
|
docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
|
|
21
21
|
docling_core/transforms/chunker/base.py,sha256=kJaRrGQynglG9wpy0IaAYTf4MKheWH5BAPzx4LE9yIg,2824
|
|
22
22
|
docling_core/transforms/chunker/hierarchical_chunker.py,sha256=7Fpwwsn2BoiR12KGPrn8fU1uuhqBLp85MRLMF0aIsL8,8281
|
|
23
|
-
docling_core/transforms/chunker/hybrid_chunker.py,sha256=
|
|
23
|
+
docling_core/transforms/chunker/hybrid_chunker.py,sha256=xjkz8hy3tXXzkJzf7QMFOEq_v8V7Jcs9tCY0Mxjge74,12548
|
|
24
24
|
docling_core/transforms/chunker/tokenizer/__init__.py,sha256=-bhXOTpoI7SYk7vn47z8Ek-RZFjJk4TfZawxsFuNHnE,34
|
|
25
25
|
docling_core/transforms/chunker/tokenizer/base.py,sha256=2gOBQPYJYC0iWXOgMG3DiNP7xEBtii7DYcib0iECq5o,575
|
|
26
26
|
docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=aZ_RNQIzcNkAHGHZw3SBCoqJHM2Ihb65eiM29O9BR6o,2506
|
|
@@ -29,19 +29,19 @@ docling_core/transforms/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3
|
|
|
29
29
|
docling_core/transforms/serializer/base.py,sha256=ZFIiZeplL-QbBs9EDUb1awqxapQ23PsApVetJtAs7Vs,6891
|
|
30
30
|
docling_core/transforms/serializer/common.py,sha256=WP-qO-woidrKyvZ56m0vlKMysoLrMzzZtHSCIwsl3ek,19119
|
|
31
31
|
docling_core/transforms/serializer/doctags.py,sha256=PuAExlP-2HxcDSP_R_phtYQU0yKBW94RrPgb85IUxck,19905
|
|
32
|
-
docling_core/transforms/serializer/html.py,sha256=
|
|
32
|
+
docling_core/transforms/serializer/html.py,sha256=SZgQa0QnknEoRwMFLdgmVsLQqLF2rQl3D7XyEZzUHCE,37151
|
|
33
33
|
docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
|
|
34
34
|
docling_core/transforms/serializer/markdown.py,sha256=wfMNrjA4wMehWLCejAhEN1eQPRixUO1SyL6ojkKkzZY,20614
|
|
35
35
|
docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
|
|
36
36
|
docling_core/transforms/visualizer/base.py,sha256=aEF7b3rHq6DVdX8zDYEPoq55BHDYe4Hh_97lBdcW4lY,555
|
|
37
|
-
docling_core/transforms/visualizer/layout_visualizer.py,sha256=
|
|
38
|
-
docling_core/transforms/visualizer/reading_order_visualizer.py,sha256=
|
|
39
|
-
docling_core/transforms/visualizer/table_visualizer.py,sha256=
|
|
37
|
+
docling_core/transforms/visualizer/layout_visualizer.py,sha256=zHzQTWcy-z1J2BcsjvakLkrp8pgStgnxhDl8YqIAotY,8035
|
|
38
|
+
docling_core/transforms/visualizer/reading_order_visualizer.py,sha256=muqmaxOBao39X3Dut0934NAjU3I4v3JN5VzzdjmoGRY,7776
|
|
39
|
+
docling_core/transforms/visualizer/table_visualizer.py,sha256=iJPjk-XQSSCH3oujcjPMz-redAwNNHseZ41lFyd-u3k,8097
|
|
40
40
|
docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
|
|
41
41
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
42
|
-
docling_core/types/doc/__init__.py,sha256=
|
|
42
|
+
docling_core/types/doc/__init__.py,sha256=pchsIq-9FH_kCTyuyDdB8L4yV77pmnxPwT7399xrqxI,1626
|
|
43
43
|
docling_core/types/doc/base.py,sha256=ndXquBrOKTFQApIJ5s2-zstj3xlVKRbJDSId0KOQnUg,14817
|
|
44
|
-
docling_core/types/doc/document.py,sha256=
|
|
44
|
+
docling_core/types/doc/document.py,sha256=JPh-9MqfOxThP5njvXZAY8sxQyhiPJLjDsSJviggItc,156829
|
|
45
45
|
docling_core/types/doc/labels.py,sha256=JiciRK7_DOkebsrfQ6PVCvS__TsKgWn1ANk84BeB14k,7359
|
|
46
46
|
docling_core/types/doc/page.py,sha256=1JMPwglaTITBvg959L_pcWPb-fXoDYGh-e_tGZMzVMQ,41060
|
|
47
47
|
docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
|
|
@@ -74,9 +74,9 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
|
|
|
74
74
|
docling_core/utils/legacy.py,sha256=DrI3QGoL755ZCIoKHF74-pTWm8R0zfFo2C2vB5dT2aY,24463
|
|
75
75
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
76
76
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
77
|
-
docling_core-2.
|
|
78
|
-
docling_core-2.
|
|
79
|
-
docling_core-2.
|
|
80
|
-
docling_core-2.
|
|
81
|
-
docling_core-2.
|
|
82
|
-
docling_core-2.
|
|
77
|
+
docling_core-2.38.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
78
|
+
docling_core-2.38.0.dist-info/METADATA,sha256=llcycAVzvc09CX0igt4VIGrGWT8UuMjnWN5rrQoEJ6s,6453
|
|
79
|
+
docling_core-2.38.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
80
|
+
docling_core-2.38.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
|
|
81
|
+
docling_core-2.38.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
|
|
82
|
+
docling_core-2.38.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|