docling-core 2.35.0__tar.gz → 2.37.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.35.0 → docling_core-2.37.0}/PKG-INFO +1 -1
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/serializer/html.py +1 -1
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/visualizer/layout_visualizer.py +1 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +11 -1
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/visualizer/table_visualizer.py +109 -4
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/doc/document.py +141 -3
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/doc/labels.py +6 -2
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core.egg-info/PKG-INFO +1 -1
- {docling_core-2.35.0 → docling_core-2.37.0}/pyproject.toml +1 -1
- {docling_core-2.35.0 → docling_core-2.37.0}/test/test_visualization.py +17 -1
- {docling_core-2.35.0 → docling_core-2.37.0}/LICENSE +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/README.md +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/__init__.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/cli/view.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/py.typed +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/search/package.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/serializer/__init__.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/serializer/base.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/serializer/common.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/serializer/doctags.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/serializer/html_styles.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/serializer/markdown.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/visualizer/__init__.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/visualizer/base.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/base.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/doc/page.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core/utils/validators.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core.egg-info/SOURCES.txt +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core.egg-info/dependency_links.txt +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core.egg-info/entry_points.txt +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core.egg-info/requires.txt +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/docling_core.egg-info/top_level.txt +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/setup.cfg +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/test/test_base.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/test/test_collection.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/test/test_data_gen_flag.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/test/test_doc_base.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/test/test_doc_legacy_convert.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/test/test_doc_schema.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/test/test_doc_schema_extractor.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/test/test_docling_doc.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/test/test_doctags_load.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/test/test_hierarchical_chunker.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/test/test_hybrid_chunker.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/test/test_json_schema_to_search_mapper.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/test/test_nlp_qa.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/test/test_otsl_table_export.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/test/test_page.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/test/test_rec_schema.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/test/test_search_meta.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/test/test_serialization.py +0 -0
- {docling_core-2.35.0 → docling_core-2.37.0}/test/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.35.0
|
|
3
|
+
Version: 2.37.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -340,7 +340,7 @@ class HTMLTableSerializer(BaseTableSerializer):
|
|
|
340
340
|
|
|
341
341
|
content = html.escape(cell.text.strip())
|
|
342
342
|
celltag = "td"
|
|
343
|
-
if cell.column_header:
|
|
343
|
+
if cell.column_header or cell.row_header or cell.row_section:
|
|
344
344
|
celltag = "th"
|
|
345
345
|
|
|
346
346
|
opening_tag = f"{celltag}"
|
{docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/visualizer/layout_visualizer.py
RENAMED
|
@@ -40,6 +40,7 @@ class LayoutVisualizer(BaseVisualizer):
|
|
|
40
40
|
"""Layout visualization parameters."""
|
|
41
41
|
|
|
42
42
|
show_label: bool = True
|
|
43
|
+
content_layers: set[ContentLayer] = {cl for cl in ContentLayer}
|
|
43
44
|
|
|
44
45
|
base_visualizer: Optional[BaseVisualizer] = None
|
|
45
46
|
params: Params = Params()
|
|
@@ -5,6 +5,7 @@ from typing import Optional
|
|
|
5
5
|
|
|
6
6
|
from PIL import ImageDraw
|
|
7
7
|
from PIL.Image import Image
|
|
8
|
+
from pydantic import BaseModel
|
|
8
9
|
from typing_extensions import override
|
|
9
10
|
|
|
10
11
|
from docling_core.transforms.visualizer.base import BaseVisualizer
|
|
@@ -14,7 +15,16 @@ from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocume
|
|
|
14
15
|
class ReadingOrderVisualizer(BaseVisualizer):
|
|
15
16
|
"""Reading order visualizer."""
|
|
16
17
|
|
|
18
|
+
class Params(BaseModel):
|
|
19
|
+
"""Layout visualization parameters."""
|
|
20
|
+
|
|
21
|
+
show_label: bool = True
|
|
22
|
+
content_layers: set[ContentLayer] = {
|
|
23
|
+
cl for cl in ContentLayer if cl != ContentLayer.BACKGROUND
|
|
24
|
+
}
|
|
25
|
+
|
|
17
26
|
base_visualizer: Optional[BaseVisualizer] = None
|
|
27
|
+
params: Params = Params()
|
|
18
28
|
|
|
19
29
|
def _draw_arrow(
|
|
20
30
|
self,
|
|
@@ -71,7 +81,7 @@ class ReadingOrderVisualizer(BaseVisualizer):
|
|
|
71
81
|
my_images: dict[Optional[int], Image] = images or {}
|
|
72
82
|
prev_page = None
|
|
73
83
|
for elem, _ in doc.iterate_items(
|
|
74
|
-
included_content_layers=
|
|
84
|
+
included_content_layers=self.params.content_layers,
|
|
75
85
|
):
|
|
76
86
|
if not isinstance(elem, DocItem):
|
|
77
87
|
continue
|
{docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/visualizer/table_visualizer.py
RENAMED
|
@@ -23,8 +23,23 @@ class TableVisualizer(BaseVisualizer):
|
|
|
23
23
|
|
|
24
24
|
# show_Label: bool = False
|
|
25
25
|
show_cells: bool = True
|
|
26
|
-
|
|
27
|
-
|
|
26
|
+
show_rows: bool = False
|
|
27
|
+
show_cols: bool = False
|
|
28
|
+
|
|
29
|
+
cell_color: tuple[int, int, int, int] = (256, 0, 0, 32)
|
|
30
|
+
cell_outline: tuple[int, int, int, int] = (256, 0, 0, 128)
|
|
31
|
+
|
|
32
|
+
row_color: tuple[int, int, int, int] = (256, 0, 0, 32)
|
|
33
|
+
row_outline: tuple[int, int, int, int] = (256, 0, 0, 128)
|
|
34
|
+
|
|
35
|
+
row_header_color: tuple[int, int, int, int] = (0, 256, 0, 32)
|
|
36
|
+
row_header_outline: tuple[int, int, int, int] = (0, 256, 0, 128)
|
|
37
|
+
|
|
38
|
+
col_color: tuple[int, int, int, int] = (0, 256, 0, 32)
|
|
39
|
+
col_outline: tuple[int, int, int, int] = (0, 256, 0, 128)
|
|
40
|
+
|
|
41
|
+
col_header_color: tuple[int, int, int, int] = (0, 0, 256, 32)
|
|
42
|
+
col_header_outline: tuple[int, int, int, int] = (0, 0, 256, 128)
|
|
28
43
|
|
|
29
44
|
base_visualizer: Optional[BaseVisualizer] = None
|
|
30
45
|
params: Params = Params()
|
|
@@ -45,7 +60,21 @@ class TableVisualizer(BaseVisualizer):
|
|
|
45
60
|
|
|
46
61
|
tl_bbox = cell.bbox.to_top_left_origin(page_height=page_height)
|
|
47
62
|
|
|
48
|
-
cell_color =
|
|
63
|
+
cell_color = self.params.cell_color # Transparent black for cells
|
|
64
|
+
cell_outline = self.params.cell_outline
|
|
65
|
+
if cell.column_header:
|
|
66
|
+
cell_color = (
|
|
67
|
+
self.params.col_header_color
|
|
68
|
+
) # Transparent black for cells
|
|
69
|
+
cell_outline = self.params.col_header_outline
|
|
70
|
+
if cell.row_header:
|
|
71
|
+
cell_color = (
|
|
72
|
+
self.params.row_header_color
|
|
73
|
+
) # Transparent black for cells
|
|
74
|
+
cell_outline = self.params.row_header_outline
|
|
75
|
+
if cell.row_section:
|
|
76
|
+
cell_color = self.params.row_header_color
|
|
77
|
+
cell_outline = self.params.row_header_outline
|
|
49
78
|
|
|
50
79
|
cx0, cy0, cx1, cy1 = tl_bbox.as_tuple()
|
|
51
80
|
cx0 *= scale_x
|
|
@@ -55,10 +84,68 @@ class TableVisualizer(BaseVisualizer):
|
|
|
55
84
|
|
|
56
85
|
draw.rectangle(
|
|
57
86
|
[(cx0, cy0), (cx1, cy1)],
|
|
58
|
-
outline=
|
|
87
|
+
outline=cell_outline,
|
|
59
88
|
fill=cell_color,
|
|
60
89
|
)
|
|
61
90
|
|
|
91
|
+
def _draw_table_rows(
|
|
92
|
+
self,
|
|
93
|
+
table: TableItem,
|
|
94
|
+
page_image: Image,
|
|
95
|
+
page_height: float,
|
|
96
|
+
scale_x: float,
|
|
97
|
+
scale_y: float,
|
|
98
|
+
):
|
|
99
|
+
"""Draw individual table cells."""
|
|
100
|
+
draw = ImageDraw.Draw(page_image, "RGBA")
|
|
101
|
+
|
|
102
|
+
rows = table.data.get_row_bounding_boxes()
|
|
103
|
+
|
|
104
|
+
for rid, bbox in rows.items():
|
|
105
|
+
|
|
106
|
+
tl_bbox = bbox.to_top_left_origin(page_height=page_height)
|
|
107
|
+
|
|
108
|
+
cx0, cy0, cx1, cy1 = tl_bbox.as_tuple()
|
|
109
|
+
cx0 *= scale_x
|
|
110
|
+
cx1 *= scale_x
|
|
111
|
+
cy0 *= scale_y
|
|
112
|
+
cy1 *= scale_y
|
|
113
|
+
|
|
114
|
+
draw.rectangle(
|
|
115
|
+
[(cx0, cy0), (cx1, cy1)],
|
|
116
|
+
outline=self.params.row_outline,
|
|
117
|
+
fill=self.params.row_color,
|
|
118
|
+
)
|
|
119
|
+
|
|
120
|
+
def _draw_table_cols(
|
|
121
|
+
self,
|
|
122
|
+
table: TableItem,
|
|
123
|
+
page_image: Image,
|
|
124
|
+
page_height: float,
|
|
125
|
+
scale_x: float,
|
|
126
|
+
scale_y: float,
|
|
127
|
+
):
|
|
128
|
+
"""Draw individual table cells."""
|
|
129
|
+
draw = ImageDraw.Draw(page_image, "RGBA")
|
|
130
|
+
|
|
131
|
+
cols = table.data.get_column_bounding_boxes()
|
|
132
|
+
|
|
133
|
+
for cid, bbox in cols.items():
|
|
134
|
+
|
|
135
|
+
tl_bbox = bbox.to_top_left_origin(page_height=page_height)
|
|
136
|
+
|
|
137
|
+
cx0, cy0, cx1, cy1 = tl_bbox.as_tuple()
|
|
138
|
+
cx0 *= scale_x
|
|
139
|
+
cx1 *= scale_x
|
|
140
|
+
cy0 *= scale_y
|
|
141
|
+
cy1 *= scale_y
|
|
142
|
+
|
|
143
|
+
draw.rectangle(
|
|
144
|
+
[(cx0, cy0), (cx1, cy1)],
|
|
145
|
+
outline=self.params.col_outline,
|
|
146
|
+
fill=self.params.col_color,
|
|
147
|
+
)
|
|
148
|
+
|
|
62
149
|
def _draw_doc_tables(
|
|
63
150
|
self,
|
|
64
151
|
doc: DoclingDocument,
|
|
@@ -108,6 +195,24 @@ class TableVisualizer(BaseVisualizer):
|
|
|
108
195
|
scale_y=image.height / doc.pages[page_nr].size.height,
|
|
109
196
|
)
|
|
110
197
|
|
|
198
|
+
if self.params.show_rows:
|
|
199
|
+
self._draw_table_rows(
|
|
200
|
+
table=elem,
|
|
201
|
+
page_height=doc.pages[page_nr].size.height,
|
|
202
|
+
page_image=image,
|
|
203
|
+
scale_x=image.width / doc.pages[page_nr].size.width,
|
|
204
|
+
scale_y=image.height / doc.pages[page_nr].size.height,
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
if self.params.show_cols:
|
|
208
|
+
self._draw_table_cols(
|
|
209
|
+
table=elem,
|
|
210
|
+
page_height=doc.pages[page_nr].size.height,
|
|
211
|
+
page_image=image,
|
|
212
|
+
scale_x=image.width / doc.pages[page_nr].size.width,
|
|
213
|
+
scale_y=image.height / doc.pages[page_nr].size.height,
|
|
214
|
+
)
|
|
215
|
+
|
|
111
216
|
else:
|
|
112
217
|
raise RuntimeError(f"Cannot visualize page-image for {page_nr}")
|
|
113
218
|
|
|
@@ -38,7 +38,7 @@ from typing_extensions import Annotated, Self, deprecated
|
|
|
38
38
|
from docling_core.search.package import VERSION_PATTERN
|
|
39
39
|
from docling_core.types.base import _JSON_POINTER_REGEX
|
|
40
40
|
from docling_core.types.doc import BoundingBox, Size
|
|
41
|
-
from docling_core.types.doc.base import ImageRefMode
|
|
41
|
+
from docling_core.types.doc.base import CoordOrigin, ImageRefMode
|
|
42
42
|
from docling_core.types.doc.labels import (
|
|
43
43
|
CodeLanguageLabel,
|
|
44
44
|
DocItemLabel,
|
|
@@ -372,6 +372,119 @@ class TableData(BaseModel): # TBD
|
|
|
372
372
|
|
|
373
373
|
return table_data
|
|
374
374
|
|
|
375
|
+
def get_row_bounding_boxes(self) -> dict[int, BoundingBox]:
|
|
376
|
+
"""Get the minimal bounding box for each row in the table.
|
|
377
|
+
|
|
378
|
+
Returns:
|
|
379
|
+
List[Optional[BoundingBox]]: A list where each element is the minimal
|
|
380
|
+
bounding box that encompasses all cells in that row, or None if no
|
|
381
|
+
cells in the row have bounding boxes.
|
|
382
|
+
"""
|
|
383
|
+
coords = []
|
|
384
|
+
for cell in self.table_cells:
|
|
385
|
+
if cell.bbox is not None:
|
|
386
|
+
coords.append(cell.bbox.coord_origin)
|
|
387
|
+
|
|
388
|
+
if len(set(coords)) > 1:
|
|
389
|
+
raise ValueError(
|
|
390
|
+
"All bounding boxes must have the same \
|
|
391
|
+
CoordOrigin to compute their union."
|
|
392
|
+
)
|
|
393
|
+
|
|
394
|
+
row_bboxes: dict[int, BoundingBox] = {}
|
|
395
|
+
|
|
396
|
+
for row_idx in range(self.num_rows):
|
|
397
|
+
row_cells_with_bbox: dict[int, list[BoundingBox]] = {}
|
|
398
|
+
|
|
399
|
+
# Collect all cells in this row that have bounding boxes
|
|
400
|
+
for cell in self.table_cells:
|
|
401
|
+
|
|
402
|
+
if (
|
|
403
|
+
cell.bbox is not None
|
|
404
|
+
and cell.start_row_offset_idx <= row_idx < cell.end_row_offset_idx
|
|
405
|
+
):
|
|
406
|
+
|
|
407
|
+
row_span = cell.end_row_offset_idx - cell.start_row_offset_idx
|
|
408
|
+
if row_span in row_cells_with_bbox:
|
|
409
|
+
row_cells_with_bbox[row_span].append(cell.bbox)
|
|
410
|
+
else:
|
|
411
|
+
row_cells_with_bbox[row_span] = [cell.bbox]
|
|
412
|
+
|
|
413
|
+
# Calculate the enclosing bounding box for this row
|
|
414
|
+
if len(row_cells_with_bbox) > 0:
|
|
415
|
+
min_row_span = min(row_cells_with_bbox.keys())
|
|
416
|
+
row_bbox: BoundingBox = BoundingBox.enclosing_bbox(
|
|
417
|
+
row_cells_with_bbox[min_row_span]
|
|
418
|
+
)
|
|
419
|
+
|
|
420
|
+
for rspan, bboxs in row_cells_with_bbox.items():
|
|
421
|
+
for bbox in bboxs:
|
|
422
|
+
row_bbox.l = min(row_bbox.l, bbox.l)
|
|
423
|
+
row_bbox.r = max(row_bbox.r, bbox.r)
|
|
424
|
+
|
|
425
|
+
row_bboxes[row_idx] = row_bbox
|
|
426
|
+
|
|
427
|
+
return row_bboxes
|
|
428
|
+
|
|
429
|
+
def get_column_bounding_boxes(self) -> dict[int, BoundingBox]:
|
|
430
|
+
"""Get the minimal bounding box for each column in the table.
|
|
431
|
+
|
|
432
|
+
Returns:
|
|
433
|
+
List[Optional[BoundingBox]]: A list where each element is the minimal
|
|
434
|
+
bounding box that encompasses all cells in that column, or None if no
|
|
435
|
+
cells in the column have bounding boxes.
|
|
436
|
+
"""
|
|
437
|
+
coords = []
|
|
438
|
+
for cell in self.table_cells:
|
|
439
|
+
if cell.bbox is not None:
|
|
440
|
+
coords.append(cell.bbox.coord_origin)
|
|
441
|
+
|
|
442
|
+
if len(set(coords)) > 1:
|
|
443
|
+
raise ValueError(
|
|
444
|
+
"All bounding boxes must have the same \
|
|
445
|
+
CoordOrigin to compute their union."
|
|
446
|
+
)
|
|
447
|
+
|
|
448
|
+
col_bboxes: dict[int, BoundingBox] = {}
|
|
449
|
+
|
|
450
|
+
for col_idx in range(self.num_cols):
|
|
451
|
+
col_cells_with_bbox: dict[int, list[BoundingBox]] = {}
|
|
452
|
+
|
|
453
|
+
# Collect all cells in this row that have bounding boxes
|
|
454
|
+
for cell in self.table_cells:
|
|
455
|
+
|
|
456
|
+
if (
|
|
457
|
+
cell.bbox is not None
|
|
458
|
+
and cell.start_col_offset_idx <= col_idx < cell.end_col_offset_idx
|
|
459
|
+
):
|
|
460
|
+
|
|
461
|
+
col_span = cell.end_col_offset_idx - cell.start_col_offset_idx
|
|
462
|
+
if col_span in col_cells_with_bbox:
|
|
463
|
+
col_cells_with_bbox[col_span].append(cell.bbox)
|
|
464
|
+
else:
|
|
465
|
+
col_cells_with_bbox[col_span] = [cell.bbox]
|
|
466
|
+
|
|
467
|
+
# Calculate the enclosing bounding box for this row
|
|
468
|
+
if len(col_cells_with_bbox) > 0:
|
|
469
|
+
min_col_span = min(col_cells_with_bbox.keys())
|
|
470
|
+
col_bbox: BoundingBox = BoundingBox.enclosing_bbox(
|
|
471
|
+
col_cells_with_bbox[min_col_span]
|
|
472
|
+
)
|
|
473
|
+
|
|
474
|
+
for rspan, bboxs in col_cells_with_bbox.items():
|
|
475
|
+
for bbox in bboxs:
|
|
476
|
+
if bbox.coord_origin == CoordOrigin.TOPLEFT:
|
|
477
|
+
col_bbox.b = max(col_bbox.b, bbox.b)
|
|
478
|
+
col_bbox.t = min(col_bbox.t, bbox.t)
|
|
479
|
+
|
|
480
|
+
elif bbox.coord_origin == CoordOrigin.BOTTOMLEFT:
|
|
481
|
+
col_bbox.b = min(col_bbox.b, bbox.b)
|
|
482
|
+
col_bbox.t = max(col_bbox.t, bbox.t)
|
|
483
|
+
|
|
484
|
+
col_bboxes[col_idx] = col_bbox
|
|
485
|
+
|
|
486
|
+
return col_bboxes
|
|
487
|
+
|
|
375
488
|
|
|
376
489
|
class PictureTabularChartData(PictureChartData):
|
|
377
490
|
"""Base class for picture chart data.
|
|
@@ -623,6 +736,7 @@ class ContentLayer(str, Enum):
|
|
|
623
736
|
|
|
624
737
|
BODY = "body"
|
|
625
738
|
FURNITURE = "furniture"
|
|
739
|
+
BACKGROUND = "background"
|
|
626
740
|
|
|
627
741
|
|
|
628
742
|
DEFAULT_CONTENT_LAYERS = {ContentLayer.BODY}
|
|
@@ -860,6 +974,7 @@ class TextItem(DocItem):
|
|
|
860
974
|
DocItemLabel.PARAGRAPH,
|
|
861
975
|
DocItemLabel.REFERENCE,
|
|
862
976
|
DocItemLabel.TEXT,
|
|
977
|
+
DocItemLabel.EMPTY_VALUE,
|
|
863
978
|
]
|
|
864
979
|
|
|
865
980
|
orig: str # untreated representation
|
|
@@ -2867,23 +2982,46 @@ class DoclingDocument(BaseModel):
|
|
|
2867
2982
|
|
|
2868
2983
|
def print_element_tree(self):
|
|
2869
2984
|
"""Print_element_tree."""
|
|
2870
|
-
for ix, (item, level) in enumerate(
|
|
2985
|
+
for ix, (item, level) in enumerate(
|
|
2986
|
+
self.iterate_items(
|
|
2987
|
+
with_groups=True,
|
|
2988
|
+
traverse_pictures=True,
|
|
2989
|
+
included_content_layers={cl for cl in ContentLayer},
|
|
2990
|
+
)
|
|
2991
|
+
):
|
|
2871
2992
|
if isinstance(item, GroupItem):
|
|
2872
2993
|
print(
|
|
2873
2994
|
" " * level,
|
|
2874
2995
|
f"{ix}: {item.label.value} with name={item.name}",
|
|
2875
2996
|
)
|
|
2997
|
+
elif isinstance(item, TextItem):
|
|
2998
|
+
print(
|
|
2999
|
+
" " * level,
|
|
3000
|
+
f"{ix}: {item.label.value}: {item.text[:min(len(item.text), 100)]}",
|
|
3001
|
+
)
|
|
3002
|
+
|
|
2876
3003
|
elif isinstance(item, DocItem):
|
|
2877
3004
|
print(" " * level, f"{ix}: {item.label.value}")
|
|
2878
3005
|
|
|
2879
3006
|
def export_to_element_tree(self) -> str:
|
|
2880
3007
|
"""Export_to_element_tree."""
|
|
2881
3008
|
texts = []
|
|
2882
|
-
for ix, (item, level) in enumerate(
|
|
3009
|
+
for ix, (item, level) in enumerate(
|
|
3010
|
+
self.iterate_items(
|
|
3011
|
+
with_groups=True,
|
|
3012
|
+
traverse_pictures=True,
|
|
3013
|
+
included_content_layers={cl for cl in ContentLayer},
|
|
3014
|
+
)
|
|
3015
|
+
):
|
|
2883
3016
|
if isinstance(item, GroupItem):
|
|
2884
3017
|
texts.append(
|
|
2885
3018
|
" " * level + f"{ix}: {item.label.value} with name={item.name}"
|
|
2886
3019
|
)
|
|
3020
|
+
elif isinstance(item, TextItem):
|
|
3021
|
+
texts.append(
|
|
3022
|
+
" " * level
|
|
3023
|
+
+ f"{ix}: {item.label.value}: {item.text[:min(len(item.text), 100)]}"
|
|
3024
|
+
)
|
|
2887
3025
|
elif isinstance(item, DocItem):
|
|
2888
3026
|
texts.append(" " * level + f"{ix}: {item.label.value}")
|
|
2889
3027
|
|
|
@@ -27,6 +27,9 @@ class DocItemLabel(str, Enum):
|
|
|
27
27
|
KEY_VALUE_REGION = "key_value_region"
|
|
28
28
|
GRADING_SCALE = "grading_scale" # for elements in forms, questionaires representing a grading scale
|
|
29
29
|
# e.g. [strongly disagree | ... | ... | strongly agree]
|
|
30
|
+
# e.g. ★★☆☆☆
|
|
31
|
+
HANDWRITTEN_TEXT = "handwritten_text"
|
|
32
|
+
EMPTY_VALUE = "empty_value" # used for empty value fields in fillable forms
|
|
30
33
|
|
|
31
34
|
# Additional labels for markup-based formats (e.g. HTML, Word)
|
|
32
35
|
PARAGRAPH = "paragraph"
|
|
@@ -60,6 +63,9 @@ class DocItemLabel(str, Enum):
|
|
|
60
63
|
DocItemLabel.KEY_VALUE_REGION: (183, 65, 14),
|
|
61
64
|
DocItemLabel.PARAGRAPH: (255, 255, 153),
|
|
62
65
|
DocItemLabel.REFERENCE: (176, 224, 230),
|
|
66
|
+
DocItemLabel.GRADING_SCALE: (255, 204, 204),
|
|
67
|
+
DocItemLabel.HANDWRITTEN_TEXT: (204, 255, 204),
|
|
68
|
+
DocItemLabel.EMPTY_VALUE: (220, 220, 220),
|
|
63
69
|
}
|
|
64
70
|
return color_map.get(label, (0, 0, 0))
|
|
65
71
|
|
|
@@ -166,7 +172,6 @@ class GraphCellLabel(str, Enum):
|
|
|
166
172
|
KEY = "key" # used to designate a key (label) of a key-value element
|
|
167
173
|
VALUE = "value" # Data value with or without explicit Key, but filled in,
|
|
168
174
|
# e.g. telephone number, address, quantity, name, date
|
|
169
|
-
EMPTY_VALUE = "empty_value" # used for empty value fields in fillable forms
|
|
170
175
|
CHECKBOX = "checkbox"
|
|
171
176
|
|
|
172
177
|
def __str__(self):
|
|
@@ -179,7 +184,6 @@ class GraphCellLabel(str, Enum):
|
|
|
179
184
|
color_map = {
|
|
180
185
|
GraphCellLabel.KEY: (255, 0, 0),
|
|
181
186
|
GraphCellLabel.VALUE: (0, 255, 0),
|
|
182
|
-
GraphCellLabel.EMPTY_VALUE: (0, 0, 255),
|
|
183
187
|
}
|
|
184
188
|
return color_map.get(label, (0, 0, 0))
|
|
185
189
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.35.0
|
|
3
|
+
Version: 2.37.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "docling-core"
|
|
3
|
-
version = "2.35.0"
|
|
3
|
+
version = "2.37.0" # DO NOT EDIT, updated automatically
|
|
4
4
|
description = "A python library to define and validate data types in Docling."
|
|
5
5
|
license = "MIT"
|
|
6
6
|
license-files = ["LICENSE"]
|
|
@@ -55,7 +55,7 @@ def test_doc_visualization_no_label():
|
|
|
55
55
|
)
|
|
56
56
|
|
|
57
57
|
|
|
58
|
-
def test_table_visualization_no_label():
|
|
58
|
+
def test_table_visualization_for_cells():
|
|
59
59
|
src = Path("./test/data/doc/2408.09869v3_enriched.json")
|
|
60
60
|
doc = DoclingDocument.load_from_json(src)
|
|
61
61
|
|
|
@@ -66,3 +66,19 @@ def test_table_visualization_no_label():
|
|
|
66
66
|
exp_file=VIZ_TEST_DATA_PATH / f"{src.stem}_table_viz_wout_lbl_p5.png",
|
|
67
67
|
actual=viz_pages[5],
|
|
68
68
|
)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def test_table_visualization_for_rows_and_cols():
|
|
72
|
+
src = Path("./test/data/doc/2408.09869v3_enriched.json")
|
|
73
|
+
doc = DoclingDocument.load_from_json(src)
|
|
74
|
+
|
|
75
|
+
visualizer = TableVisualizer(
|
|
76
|
+
params=TableVisualizer.Params(show_cells=False, show_rows=True, show_cols=True)
|
|
77
|
+
)
|
|
78
|
+
viz_pages = visualizer.get_visualization(doc=doc)
|
|
79
|
+
|
|
80
|
+
verify(
|
|
81
|
+
exp_file=VIZ_TEST_DATA_PATH
|
|
82
|
+
/ f"{src.stem}_table_viz_wout_lbl_p5_rows_and_cols.png",
|
|
83
|
+
actual=viz_pages[5],
|
|
84
|
+
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.35.0 → docling_core-2.37.0}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.35.0 → docling_core-2.37.0}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/chunker/tokenizer/__init__.py
RENAMED
|
File without changes
|
{docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/chunker/tokenizer/base.py
RENAMED
|
File without changes
|
{docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/chunker/tokenizer/huggingface.py
RENAMED
|
File without changes
|
{docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/chunker/tokenizer/openai.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.35.0 → docling_core-2.37.0}/docling_core/transforms/serializer/html_styles.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|