docling-core 2.34.2__tar.gz → 2.36.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.34.2 → docling_core-2.36.0}/PKG-INFO +1 -1
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/visualizer/layout_visualizer.py +9 -4
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +11 -1
- docling_core-2.36.0/docling_core/transforms/visualizer/table_visualizer.py +135 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/doc/document.py +27 -2
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/doc/labels.py +6 -2
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core.egg-info/PKG-INFO +1 -1
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core.egg-info/SOURCES.txt +1 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/pyproject.toml +1 -1
- {docling_core-2.34.2 → docling_core-2.36.0}/test/test_visualization.py +14 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/LICENSE +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/README.md +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/__init__.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/cli/view.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/py.typed +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/search/package.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/serializer/__init__.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/serializer/base.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/serializer/common.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/serializer/doctags.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/serializer/html.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/serializer/html_styles.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/serializer/markdown.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/visualizer/__init__.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/visualizer/base.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/base.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/doc/page.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core/utils/validators.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core.egg-info/dependency_links.txt +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core.egg-info/entry_points.txt +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core.egg-info/requires.txt +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/docling_core.egg-info/top_level.txt +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/setup.cfg +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/test/test_base.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/test/test_collection.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/test/test_data_gen_flag.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/test/test_doc_base.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/test/test_doc_legacy_convert.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/test/test_doc_schema.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/test/test_doc_schema_extractor.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/test/test_docling_doc.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/test/test_doctags_load.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/test/test_hierarchical_chunker.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/test/test_hybrid_chunker.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/test/test_json_schema_to_search_mapper.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/test/test_nlp_qa.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/test/test_otsl_table_export.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/test/test_page.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/test/test_rec_schema.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/test/test_search_meta.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/test/test_serialization.py +0 -0
- {docling_core-2.34.2 → docling_core-2.36.0}/test/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.34.2
|
|
3
|
+
Version: 2.36.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
{docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/visualizer/layout_visualizer.py
RENAMED
|
@@ -40,6 +40,7 @@ class LayoutVisualizer(BaseVisualizer):
|
|
|
40
40
|
"""Layout visualization parameters."""
|
|
41
41
|
|
|
42
42
|
show_label: bool = True
|
|
43
|
+
content_layers: set[ContentLayer] = {cl for cl in ContentLayer}
|
|
43
44
|
|
|
44
45
|
base_visualizer: Optional[BaseVisualizer] = None
|
|
45
46
|
params: Params = Params()
|
|
@@ -119,7 +120,10 @@ class LayoutVisualizer(BaseVisualizer):
|
|
|
119
120
|
)
|
|
120
121
|
|
|
121
122
|
def _draw_doc_layout(
|
|
122
|
-
self,
|
|
123
|
+
self,
|
|
124
|
+
doc: DoclingDocument,
|
|
125
|
+
images: Optional[dict[Optional[int], Image]] = None,
|
|
126
|
+
included_content_layers: Optional[set[ContentLayer]] = None,
|
|
123
127
|
):
|
|
124
128
|
"""Draw the document clusters and optionaly the reading order."""
|
|
125
129
|
clusters = []
|
|
@@ -128,6 +132,9 @@ class LayoutVisualizer(BaseVisualizer):
|
|
|
128
132
|
if images is not None:
|
|
129
133
|
my_images = images
|
|
130
134
|
|
|
135
|
+
if included_content_layers is None:
|
|
136
|
+
included_content_layers = {c for c in ContentLayer}
|
|
137
|
+
|
|
131
138
|
# Initialise `my_images` beforehand: sometimes, you have the
|
|
132
139
|
# page-images but no DocItems!
|
|
133
140
|
for page_nr, page in doc.pages.items():
|
|
@@ -141,9 +148,7 @@ class LayoutVisualizer(BaseVisualizer):
|
|
|
141
148
|
prev_image = None
|
|
142
149
|
prev_page_nr = None
|
|
143
150
|
for idx, (elem, _) in enumerate(
|
|
144
|
-
doc.iterate_items(
|
|
145
|
-
included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE}
|
|
146
|
-
)
|
|
151
|
+
doc.iterate_items(included_content_layers=included_content_layers)
|
|
147
152
|
):
|
|
148
153
|
if not isinstance(elem, DocItem):
|
|
149
154
|
continue
|
|
@@ -5,6 +5,7 @@ from typing import Optional
|
|
|
5
5
|
|
|
6
6
|
from PIL import ImageDraw
|
|
7
7
|
from PIL.Image import Image
|
|
8
|
+
from pydantic import BaseModel
|
|
8
9
|
from typing_extensions import override
|
|
9
10
|
|
|
10
11
|
from docling_core.transforms.visualizer.base import BaseVisualizer
|
|
@@ -14,7 +15,16 @@ from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocume
|
|
|
14
15
|
class ReadingOrderVisualizer(BaseVisualizer):
|
|
15
16
|
"""Reading order visualizer."""
|
|
16
17
|
|
|
18
|
+
class Params(BaseModel):
|
|
19
|
+
"""Layout visualization parameters."""
|
|
20
|
+
|
|
21
|
+
show_label: bool = True
|
|
22
|
+
content_layers: set[ContentLayer] = {
|
|
23
|
+
cl for cl in ContentLayer if cl != ContentLayer.BACKGROUND
|
|
24
|
+
}
|
|
25
|
+
|
|
17
26
|
base_visualizer: Optional[BaseVisualizer] = None
|
|
27
|
+
params: Params = Params()
|
|
18
28
|
|
|
19
29
|
def _draw_arrow(
|
|
20
30
|
self,
|
|
@@ -71,7 +81,7 @@ class ReadingOrderVisualizer(BaseVisualizer):
|
|
|
71
81
|
my_images: dict[Optional[int], Image] = images or {}
|
|
72
82
|
prev_page = None
|
|
73
83
|
for elem, _ in doc.iterate_items(
|
|
74
|
-
included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE},
|
|
84
|
+
included_content_layers=self.params.content_layers,
|
|
75
85
|
):
|
|
76
86
|
if not isinstance(elem, DocItem):
|
|
77
87
|
continue
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""Define classes for layout visualization."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from copy import deepcopy
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from PIL import ImageDraw
|
|
8
|
+
from PIL.Image import Image
|
|
9
|
+
from pydantic import BaseModel
|
|
10
|
+
from typing_extensions import override
|
|
11
|
+
|
|
12
|
+
from docling_core.transforms.visualizer.base import BaseVisualizer
|
|
13
|
+
from docling_core.types.doc.document import ContentLayer, DoclingDocument, TableItem
|
|
14
|
+
|
|
15
|
+
_log = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class TableVisualizer(BaseVisualizer):
|
|
19
|
+
"""Table visualizer."""
|
|
20
|
+
|
|
21
|
+
class Params(BaseModel):
|
|
22
|
+
"""Table visualization parameters."""
|
|
23
|
+
|
|
24
|
+
# show_Label: bool = False
|
|
25
|
+
show_cells: bool = True
|
|
26
|
+
# show_rows: bool = False
|
|
27
|
+
# show_cols: bool = False
|
|
28
|
+
|
|
29
|
+
base_visualizer: Optional[BaseVisualizer] = None
|
|
30
|
+
params: Params = Params()
|
|
31
|
+
|
|
32
|
+
def _draw_table_cells(
|
|
33
|
+
self,
|
|
34
|
+
table: TableItem,
|
|
35
|
+
page_image: Image,
|
|
36
|
+
page_height: float,
|
|
37
|
+
scale_x: float,
|
|
38
|
+
scale_y: float,
|
|
39
|
+
):
|
|
40
|
+
"""Draw individual table cells."""
|
|
41
|
+
draw = ImageDraw.Draw(page_image, "RGBA")
|
|
42
|
+
|
|
43
|
+
for cell in table.data.table_cells:
|
|
44
|
+
if cell.bbox is not None:
|
|
45
|
+
|
|
46
|
+
tl_bbox = cell.bbox.to_top_left_origin(page_height=page_height)
|
|
47
|
+
|
|
48
|
+
cell_color = (256, 0, 0, 32) # Transparent black for cells
|
|
49
|
+
|
|
50
|
+
cx0, cy0, cx1, cy1 = tl_bbox.as_tuple()
|
|
51
|
+
cx0 *= scale_x
|
|
52
|
+
cx1 *= scale_x
|
|
53
|
+
cy0 *= scale_y
|
|
54
|
+
cy1 *= scale_y
|
|
55
|
+
|
|
56
|
+
draw.rectangle(
|
|
57
|
+
[(cx0, cy0), (cx1, cy1)],
|
|
58
|
+
outline=(256, 0, 0, 128),
|
|
59
|
+
fill=cell_color,
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
def _draw_doc_tables(
|
|
63
|
+
self,
|
|
64
|
+
doc: DoclingDocument,
|
|
65
|
+
images: Optional[dict[Optional[int], Image]] = None,
|
|
66
|
+
included_content_layers: Optional[set[ContentLayer]] = None,
|
|
67
|
+
):
|
|
68
|
+
"""Draw the document tables."""
|
|
69
|
+
my_images: dict[Optional[int], Image] = {}
|
|
70
|
+
|
|
71
|
+
if images is not None:
|
|
72
|
+
my_images = images
|
|
73
|
+
|
|
74
|
+
if included_content_layers is None:
|
|
75
|
+
included_content_layers = {c for c in ContentLayer}
|
|
76
|
+
|
|
77
|
+
# Initialise `my_images` beforehand: sometimes, you have the
|
|
78
|
+
# page-images but no DocItems!
|
|
79
|
+
for page_nr, page in doc.pages.items():
|
|
80
|
+
page_image = doc.pages[page_nr].image
|
|
81
|
+
if page_image is None or (pil_img := page_image.pil_image) is None:
|
|
82
|
+
raise RuntimeError("Cannot visualize document without images")
|
|
83
|
+
elif page_nr not in my_images:
|
|
84
|
+
image = deepcopy(pil_img)
|
|
85
|
+
my_images[page_nr] = image
|
|
86
|
+
|
|
87
|
+
for idx, (elem, _) in enumerate(
|
|
88
|
+
doc.iterate_items(included_content_layers=included_content_layers)
|
|
89
|
+
):
|
|
90
|
+
if not isinstance(elem, TableItem):
|
|
91
|
+
continue
|
|
92
|
+
if len(elem.prov) == 0:
|
|
93
|
+
continue # Skip elements without provenances
|
|
94
|
+
|
|
95
|
+
if len(elem.prov) == 1:
|
|
96
|
+
|
|
97
|
+
page_nr = elem.prov[0].page_no
|
|
98
|
+
|
|
99
|
+
if page_nr in my_images:
|
|
100
|
+
image = my_images[page_nr]
|
|
101
|
+
|
|
102
|
+
if self.params.show_cells:
|
|
103
|
+
self._draw_table_cells(
|
|
104
|
+
table=elem,
|
|
105
|
+
page_height=doc.pages[page_nr].size.height,
|
|
106
|
+
page_image=image,
|
|
107
|
+
scale_x=image.width / doc.pages[page_nr].size.width,
|
|
108
|
+
scale_y=image.height / doc.pages[page_nr].size.height,
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
else:
|
|
112
|
+
raise RuntimeError(f"Cannot visualize page-image for {page_nr}")
|
|
113
|
+
|
|
114
|
+
else:
|
|
115
|
+
_log.error("Can not yet visualise tables with multiple provenances")
|
|
116
|
+
|
|
117
|
+
return my_images
|
|
118
|
+
|
|
119
|
+
@override
|
|
120
|
+
def get_visualization(
|
|
121
|
+
self,
|
|
122
|
+
*,
|
|
123
|
+
doc: DoclingDocument,
|
|
124
|
+
**kwargs,
|
|
125
|
+
) -> dict[Optional[int], Image]:
|
|
126
|
+
"""Get visualization of the document as images by page."""
|
|
127
|
+
base_images = (
|
|
128
|
+
self.base_visualizer.get_visualization(doc=doc, **kwargs)
|
|
129
|
+
if self.base_visualizer
|
|
130
|
+
else None
|
|
131
|
+
)
|
|
132
|
+
return self._draw_doc_tables(
|
|
133
|
+
doc=doc,
|
|
134
|
+
images=base_images,
|
|
135
|
+
)
|
|
@@ -623,6 +623,7 @@ class ContentLayer(str, Enum):
|
|
|
623
623
|
|
|
624
624
|
BODY = "body"
|
|
625
625
|
FURNITURE = "furniture"
|
|
626
|
+
BACKGROUND = "background"
|
|
626
627
|
|
|
627
628
|
|
|
628
629
|
DEFAULT_CONTENT_LAYERS = {ContentLayer.BODY}
|
|
@@ -860,6 +861,7 @@ class TextItem(DocItem):
|
|
|
860
861
|
DocItemLabel.PARAGRAPH,
|
|
861
862
|
DocItemLabel.REFERENCE,
|
|
862
863
|
DocItemLabel.TEXT,
|
|
864
|
+
DocItemLabel.EMPTY_VALUE,
|
|
863
865
|
]
|
|
864
866
|
|
|
865
867
|
orig: str # untreated representation
|
|
@@ -2867,23 +2869,46 @@ class DoclingDocument(BaseModel):
|
|
|
2867
2869
|
|
|
2868
2870
|
def print_element_tree(self):
|
|
2869
2871
|
"""Print_element_tree."""
|
|
2870
|
-
for ix, (item, level) in enumerate(
|
|
2872
|
+
for ix, (item, level) in enumerate(
|
|
2873
|
+
self.iterate_items(
|
|
2874
|
+
with_groups=True,
|
|
2875
|
+
traverse_pictures=True,
|
|
2876
|
+
included_content_layers={cl for cl in ContentLayer},
|
|
2877
|
+
)
|
|
2878
|
+
):
|
|
2871
2879
|
if isinstance(item, GroupItem):
|
|
2872
2880
|
print(
|
|
2873
2881
|
" " * level,
|
|
2874
2882
|
f"{ix}: {item.label.value} with name={item.name}",
|
|
2875
2883
|
)
|
|
2884
|
+
elif isinstance(item, TextItem):
|
|
2885
|
+
print(
|
|
2886
|
+
" " * level,
|
|
2887
|
+
f"{ix}: {item.label.value}: {item.text[:min(len(item.text), 100)]}",
|
|
2888
|
+
)
|
|
2889
|
+
|
|
2876
2890
|
elif isinstance(item, DocItem):
|
|
2877
2891
|
print(" " * level, f"{ix}: {item.label.value}")
|
|
2878
2892
|
|
|
2879
2893
|
def export_to_element_tree(self) -> str:
|
|
2880
2894
|
"""Export_to_element_tree."""
|
|
2881
2895
|
texts = []
|
|
2882
|
-
for ix, (item, level) in enumerate(
|
|
2896
|
+
for ix, (item, level) in enumerate(
|
|
2897
|
+
self.iterate_items(
|
|
2898
|
+
with_groups=True,
|
|
2899
|
+
traverse_pictures=True,
|
|
2900
|
+
included_content_layers={cl for cl in ContentLayer},
|
|
2901
|
+
)
|
|
2902
|
+
):
|
|
2883
2903
|
if isinstance(item, GroupItem):
|
|
2884
2904
|
texts.append(
|
|
2885
2905
|
" " * level + f"{ix}: {item.label.value} with name={item.name}"
|
|
2886
2906
|
)
|
|
2907
|
+
elif isinstance(item, TextItem):
|
|
2908
|
+
texts.append(
|
|
2909
|
+
" " * level
|
|
2910
|
+
+ f"{ix}: {item.label.value}: {item.text[:min(len(item.text), 100)]}"
|
|
2911
|
+
)
|
|
2887
2912
|
elif isinstance(item, DocItem):
|
|
2888
2913
|
texts.append(" " * level + f"{ix}: {item.label.value}")
|
|
2889
2914
|
|
|
@@ -27,6 +27,9 @@ class DocItemLabel(str, Enum):
|
|
|
27
27
|
KEY_VALUE_REGION = "key_value_region"
|
|
28
28
|
GRADING_SCALE = "grading_scale" # for elements in forms, questionaires representing a grading scale
|
|
29
29
|
# e.g. [strongly disagree | ... | ... | strongly agree]
|
|
30
|
+
# e.g. ★★☆☆☆
|
|
31
|
+
HANDWRITTEN_TEXT = "handwritten_text"
|
|
32
|
+
EMPTY_VALUE = "empty_value" # used for empty value fields in fillable forms
|
|
30
33
|
|
|
31
34
|
# Additional labels for markup-based formats (e.g. HTML, Word)
|
|
32
35
|
PARAGRAPH = "paragraph"
|
|
@@ -60,6 +63,9 @@ class DocItemLabel(str, Enum):
|
|
|
60
63
|
DocItemLabel.KEY_VALUE_REGION: (183, 65, 14),
|
|
61
64
|
DocItemLabel.PARAGRAPH: (255, 255, 153),
|
|
62
65
|
DocItemLabel.REFERENCE: (176, 224, 230),
|
|
66
|
+
DocItemLabel.GRADING_SCALE: (255, 204, 204),
|
|
67
|
+
DocItemLabel.HANDWRITTEN_TEXT: (204, 255, 204),
|
|
68
|
+
DocItemLabel.EMPTY_VALUE: (220, 220, 220),
|
|
63
69
|
}
|
|
64
70
|
return color_map.get(label, (0, 0, 0))
|
|
65
71
|
|
|
@@ -166,7 +172,6 @@ class GraphCellLabel(str, Enum):
|
|
|
166
172
|
KEY = "key" # used to designate a key (label) of a key-value element
|
|
167
173
|
VALUE = "value" # Data value with or without explicit Key, but filled in,
|
|
168
174
|
# e.g. telephone number, address, quantity, name, date
|
|
169
|
-
EMPTY_VALUE = "empty_value" # used for empty value fields in fillable forms
|
|
170
175
|
CHECKBOX = "checkbox"
|
|
171
176
|
|
|
172
177
|
def __str__(self):
|
|
@@ -179,7 +184,6 @@ class GraphCellLabel(str, Enum):
|
|
|
179
184
|
color_map = {
|
|
180
185
|
GraphCellLabel.KEY: (255, 0, 0),
|
|
181
186
|
GraphCellLabel.VALUE: (0, 255, 0),
|
|
182
|
-
GraphCellLabel.EMPTY_VALUE: (0, 0, 255),
|
|
183
187
|
}
|
|
184
188
|
return color_map.get(label, (0, 0, 0))
|
|
185
189
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.34.2
|
|
3
|
+
Version: 2.36.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -45,6 +45,7 @@ docling_core/transforms/visualizer/__init__.py
|
|
|
45
45
|
docling_core/transforms/visualizer/base.py
|
|
46
46
|
docling_core/transforms/visualizer/layout_visualizer.py
|
|
47
47
|
docling_core/transforms/visualizer/reading_order_visualizer.py
|
|
48
|
+
docling_core/transforms/visualizer/table_visualizer.py
|
|
48
49
|
docling_core/types/__init__.py
|
|
49
50
|
docling_core/types/base.py
|
|
50
51
|
docling_core/types/doc/__init__.py
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "docling-core"
|
|
3
|
-
version = "2.34.2" # DO NOT EDIT, updated automatically
|
|
3
|
+
version = "2.36.0" # DO NOT EDIT, updated automatically
|
|
4
4
|
description = "A python library to define and validate data types in Docling."
|
|
5
5
|
license = "MIT"
|
|
6
6
|
license-files = ["LICENSE"]
|
|
@@ -2,6 +2,7 @@ from pathlib import Path
|
|
|
2
2
|
|
|
3
3
|
import PIL.Image
|
|
4
4
|
|
|
5
|
+
from docling_core.transforms.visualizer.table_visualizer import TableVisualizer
|
|
5
6
|
from docling_core.types.doc.document import DoclingDocument
|
|
6
7
|
|
|
7
8
|
from .test_data_gen_flag import GEN_TEST_DATA
|
|
@@ -52,3 +53,16 @@ def test_doc_visualization_no_label():
|
|
|
52
53
|
exp_file=VIZ_TEST_DATA_PATH / f"{src.stem}_viz_wout_lbl_p{k}.png",
|
|
53
54
|
actual=viz_pages[k],
|
|
54
55
|
)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def test_table_visualization_no_label():
|
|
59
|
+
src = Path("./test/data/doc/2408.09869v3_enriched.json")
|
|
60
|
+
doc = DoclingDocument.load_from_json(src)
|
|
61
|
+
|
|
62
|
+
visualizer = TableVisualizer()
|
|
63
|
+
viz_pages = visualizer.get_visualization(doc=doc)
|
|
64
|
+
|
|
65
|
+
verify(
|
|
66
|
+
exp_file=VIZ_TEST_DATA_PATH / f"{src.stem}_table_viz_wout_lbl_p5.png",
|
|
67
|
+
actual=viz_pages[5],
|
|
68
|
+
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.34.2 → docling_core-2.36.0}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.34.2 → docling_core-2.36.0}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/chunker/tokenizer/__init__.py
RENAMED
|
File without changes
|
{docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/chunker/tokenizer/base.py
RENAMED
|
File without changes
|
{docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/chunker/tokenizer/huggingface.py
RENAMED
|
File without changes
|
{docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/chunker/tokenizer/openai.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.34.2 → docling_core-2.36.0}/docling_core/transforms/serializer/html_styles.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|