docling-core 2.43.1__tar.gz → 2.44.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.43.1 → docling_core-2.44.1}/PKG-INFO +1 -1
- docling_core-2.44.1/docling_core/transforms/visualizer/key_value_visualizer.py +217 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/doc/document.py +59 -22
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core.egg-info/PKG-INFO +1 -1
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core.egg-info/SOURCES.txt +1 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/pyproject.toml +1 -1
- {docling_core-2.43.1 → docling_core-2.44.1}/test/test_docling_doc.py +14 -13
- {docling_core-2.43.1 → docling_core-2.44.1}/LICENSE +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/README.md +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/__init__.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/cli/view.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/py.typed +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/search/__init__.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/search/mapping.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/search/meta.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/search/package.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/chunker/page_chunker.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/serializer/__init__.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/serializer/base.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/serializer/common.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/serializer/doctags.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/serializer/html.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/serializer/html_styles.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/serializer/markdown.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/visualizer/__init__.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/visualizer/base.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/visualizer/layout_visualizer.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/visualizer/table_visualizer.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/__init__.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/base.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/doc/page.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/utils/alias.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/utils/file.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/utils/validate.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core/utils/validators.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core.egg-info/dependency_links.txt +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core.egg-info/entry_points.txt +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core.egg-info/requires.txt +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/docling_core.egg-info/top_level.txt +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/setup.cfg +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/test/test_base.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/test/test_collection.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/test/test_data_gen_flag.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/test/test_doc_base.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/test/test_doc_legacy_convert.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/test/test_doc_schema.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/test/test_doc_schema_extractor.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/test/test_doctags_load.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/test/test_hierarchical_chunker.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/test/test_hybrid_chunker.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/test/test_json_schema_to_search_mapper.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/test/test_nlp_qa.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/test/test_otsl_table_export.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/test/test_page.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/test/test_page_chunker.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/test/test_rec_schema.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/test/test_search_meta.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/test/test_serialization.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/test/test_utils.py +0 -0
- {docling_core-2.43.1 → docling_core-2.44.1}/test/test_visualization.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.43.1
|
|
3
|
+
Version: 2.44.1
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
"""Key‑value visualizer overlaying key/value cells and their links on page images.
|
|
2
|
+
|
|
3
|
+
This module complements :py:class:`layout_visualizer.LayoutVisualizer` by drawing
|
|
4
|
+
*key* and *value* cells plus the directed links between them. It can be stacked
|
|
5
|
+
on top of any other :py:class:`BaseVisualizer` – e.g. first draw the general
|
|
6
|
+
layout, then add the key‑value layer.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from copy import deepcopy
|
|
10
|
+
from typing import Optional, Union
|
|
11
|
+
|
|
12
|
+
from PIL import ImageDraw, ImageFont
|
|
13
|
+
from PIL.Image import Image
|
|
14
|
+
from PIL.ImageFont import FreeTypeFont
|
|
15
|
+
from pydantic import BaseModel
|
|
16
|
+
from typing_extensions import override
|
|
17
|
+
|
|
18
|
+
from docling_core.transforms.visualizer.base import BaseVisualizer
|
|
19
|
+
from docling_core.types.doc.document import ContentLayer, DoclingDocument
|
|
20
|
+
from docling_core.types.doc.labels import GraphCellLabel, GraphLinkLabel
|
|
21
|
+
|
|
22
|
+
# ---------------------------------------------------------------------------
|
|
23
|
+
# Helper functions / constants
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
|
|
26
|
+
# Semi‑transparent RGBA colours for key / value cells and their connecting link
|
|
27
|
+
_KEY_FILL = (0, 170, 0, 70) # greenish
|
|
28
|
+
_VALUE_FILL = (0, 0, 200, 70) # bluish
|
|
29
|
+
_LINK_COLOUR = (255, 0, 0, 255) # red line (solid)
|
|
30
|
+
|
|
31
|
+
_LABEL_TXT_COLOUR = (0, 0, 0, 255)
|
|
32
|
+
_LABEL_BG_COLOUR = (255, 255, 255, 180) # semi‑transparent white
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class KeyValueVisualizer(BaseVisualizer):
|
|
36
|
+
"""Draw key/value graphs stored in :py:attr:`DoclingDocument.key_value_items`."""
|
|
37
|
+
|
|
38
|
+
class Params(BaseModel):
|
|
39
|
+
"""Parameters for KeyValueVisualizer controlling label and cell id display, and content layers to visualize."""
|
|
40
|
+
|
|
41
|
+
show_label: bool = True # draw cell text close to bbox
|
|
42
|
+
show_cell_id: bool = False # annotate each rectangle with its cell_id
|
|
43
|
+
content_layers: set[ContentLayer] = {cl for cl in ContentLayer}
|
|
44
|
+
|
|
45
|
+
base_visualizer: Optional[BaseVisualizer] = None
|
|
46
|
+
params: Params = Params()
|
|
47
|
+
|
|
48
|
+
# ---------------------------------------------------------------------
|
|
49
|
+
# Internal helpers
|
|
50
|
+
# ---------------------------------------------------------------------
|
|
51
|
+
|
|
52
|
+
def _cell_fill(self, label: GraphCellLabel) -> tuple[int, int, int, int]:
|
|
53
|
+
"""Return RGBA fill colour depending on *label*."""
|
|
54
|
+
return _KEY_FILL if label == GraphCellLabel.KEY else _VALUE_FILL
|
|
55
|
+
|
|
56
|
+
def _draw_key_value_layer(
|
|
57
|
+
self,
|
|
58
|
+
*,
|
|
59
|
+
image: Image,
|
|
60
|
+
doc: DoclingDocument,
|
|
61
|
+
page_no: int,
|
|
62
|
+
scale_x: float,
|
|
63
|
+
scale_y: float,
|
|
64
|
+
) -> None:
|
|
65
|
+
"""Draw every key‑value graph that has cells on *page_no* onto *image*."""
|
|
66
|
+
draw = ImageDraw.Draw(image, "RGBA")
|
|
67
|
+
# Choose a small truetype font if available, otherwise default bitmap font
|
|
68
|
+
font: Union[ImageFont.ImageFont, FreeTypeFont]
|
|
69
|
+
try:
|
|
70
|
+
font = ImageFont.truetype("arial.ttf", 12)
|
|
71
|
+
except OSError:
|
|
72
|
+
font = ImageFont.load_default()
|
|
73
|
+
|
|
74
|
+
for kv_item in doc.key_value_items:
|
|
75
|
+
cell_dict = {cell.cell_id: cell for cell in kv_item.graph.cells}
|
|
76
|
+
|
|
77
|
+
# ------------------------------------------------------------------
|
|
78
|
+
# First draw cells (rectangles + optional labels)
|
|
79
|
+
# ------------------------------------------------------------------
|
|
80
|
+
for cell in cell_dict.values():
|
|
81
|
+
if cell.prov is None or cell.prov.page_no != page_no:
|
|
82
|
+
continue # skip cells not on this page or without bbox
|
|
83
|
+
|
|
84
|
+
tl_bbox = cell.prov.bbox.to_top_left_origin(
|
|
85
|
+
page_height=doc.pages[page_no].size.height
|
|
86
|
+
)
|
|
87
|
+
x0, y0, x1, y1 = tl_bbox.as_tuple()
|
|
88
|
+
x0 *= scale_x
|
|
89
|
+
x1 *= scale_x
|
|
90
|
+
y0 *= scale_y
|
|
91
|
+
y1 *= scale_y
|
|
92
|
+
fill_rgba = self._cell_fill(cell.label)
|
|
93
|
+
|
|
94
|
+
draw.rectangle(
|
|
95
|
+
[(x0, y0), (x1, y1)],
|
|
96
|
+
outline=fill_rgba[:-1] + (255,),
|
|
97
|
+
fill=fill_rgba,
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
if self.params.show_label:
|
|
101
|
+
txt_parts = []
|
|
102
|
+
if self.params.show_cell_id:
|
|
103
|
+
txt_parts.append(str(cell.cell_id))
|
|
104
|
+
txt_parts.append(cell.text)
|
|
105
|
+
label_text = " | ".join(txt_parts)
|
|
106
|
+
|
|
107
|
+
tbx = draw.textbbox((x0, y0), label_text, font=font)
|
|
108
|
+
pad = 2
|
|
109
|
+
draw.rectangle(
|
|
110
|
+
[(tbx[0] - pad, tbx[1] - pad), (tbx[2] + pad, tbx[3] + pad)],
|
|
111
|
+
fill=_LABEL_BG_COLOUR,
|
|
112
|
+
)
|
|
113
|
+
draw.text((x0, y0), label_text, font=font, fill=_LABEL_TXT_COLOUR)
|
|
114
|
+
|
|
115
|
+
# ------------------------------------------------------------------
|
|
116
|
+
# Then draw links (after rectangles so they appear on top)
|
|
117
|
+
# ------------------------------------------------------------------
|
|
118
|
+
for link in kv_item.graph.links:
|
|
119
|
+
if link.label != GraphLinkLabel.TO_VALUE:
|
|
120
|
+
# Future‑proof: ignore other link types silently
|
|
121
|
+
continue
|
|
122
|
+
|
|
123
|
+
src_cell = cell_dict.get(link.source_cell_id)
|
|
124
|
+
tgt_cell = cell_dict.get(link.target_cell_id)
|
|
125
|
+
if src_cell is None or tgt_cell is None:
|
|
126
|
+
continue
|
|
127
|
+
if (
|
|
128
|
+
src_cell.prov is None
|
|
129
|
+
or tgt_cell.prov is None
|
|
130
|
+
or src_cell.prov.page_no != page_no
|
|
131
|
+
or tgt_cell.prov.page_no != page_no
|
|
132
|
+
):
|
|
133
|
+
continue # only draw if both ends are on this page
|
|
134
|
+
|
|
135
|
+
def _centre(bbox):
|
|
136
|
+
tl = bbox.to_top_left_origin(
|
|
137
|
+
page_height=doc.pages[page_no].size.height
|
|
138
|
+
)
|
|
139
|
+
l, t, r, b = tl.as_tuple()
|
|
140
|
+
return ((l + r) / 2 * scale_x, (t + b) / 2 * scale_y)
|
|
141
|
+
|
|
142
|
+
src_xy = _centre(src_cell.prov.bbox)
|
|
143
|
+
tgt_xy = _centre(tgt_cell.prov.bbox)
|
|
144
|
+
|
|
145
|
+
draw.line([src_xy, tgt_xy], fill=_LINK_COLOUR, width=2)
|
|
146
|
+
|
|
147
|
+
# draw a small arrow‑head by rendering a short orthogonal line
|
|
148
|
+
# segment; exact geometry is not critical for visual inspection
|
|
149
|
+
arrow_len = 6
|
|
150
|
+
dx = tgt_xy[0] - src_xy[0]
|
|
151
|
+
dy = tgt_xy[1] - src_xy[1]
|
|
152
|
+
length = (dx**2 + dy**2) ** 0.5 or 1.0
|
|
153
|
+
ux, uy = dx / length, dy / length
|
|
154
|
+
# perpendicular vector
|
|
155
|
+
px, py = -uy, ux
|
|
156
|
+
# two points forming the arrow head triangle base
|
|
157
|
+
head_base_left = (
|
|
158
|
+
tgt_xy[0] - ux * arrow_len - px * arrow_len / 2,
|
|
159
|
+
tgt_xy[1] - uy * arrow_len - py * arrow_len / 2,
|
|
160
|
+
)
|
|
161
|
+
head_base_right = (
|
|
162
|
+
tgt_xy[0] - ux * arrow_len + px * arrow_len / 2,
|
|
163
|
+
tgt_xy[1] - uy * arrow_len + py * arrow_len / 2,
|
|
164
|
+
)
|
|
165
|
+
draw.polygon(
|
|
166
|
+
[tgt_xy, head_base_left, head_base_right], fill=_LINK_COLOUR
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
# ---------------------------------------------------------------------
|
|
170
|
+
# Public API – BaseVisualizer implementation
|
|
171
|
+
# ---------------------------------------------------------------------
|
|
172
|
+
|
|
173
|
+
@override
|
|
174
|
+
def get_visualization(
|
|
175
|
+
self,
|
|
176
|
+
*,
|
|
177
|
+
doc: DoclingDocument,
|
|
178
|
+
included_content_layers: Optional[set[ContentLayer]] = None,
|
|
179
|
+
**kwargs,
|
|
180
|
+
) -> dict[Optional[int], Image]:
|
|
181
|
+
"""Return page‑wise images with key/value overlay (incl. base layer)."""
|
|
182
|
+
base_images = (
|
|
183
|
+
self.base_visualizer.get_visualization(
|
|
184
|
+
doc=doc, included_content_layers=included_content_layers, **kwargs
|
|
185
|
+
)
|
|
186
|
+
if self.base_visualizer
|
|
187
|
+
else None
|
|
188
|
+
)
|
|
189
|
+
|
|
190
|
+
if included_content_layers is None:
|
|
191
|
+
included_content_layers = {cl for cl in ContentLayer}
|
|
192
|
+
|
|
193
|
+
images: dict[Optional[int], Image] = {}
|
|
194
|
+
|
|
195
|
+
# Ensure we have page images to draw on
|
|
196
|
+
for page_nr, page in doc.pages.items():
|
|
197
|
+
base_img = (base_images or {}).get(page_nr)
|
|
198
|
+
if base_img is None:
|
|
199
|
+
if page.image is None or (pil_img := page.image.pil_image) is None:
|
|
200
|
+
raise RuntimeError("Cannot visualize document without page images")
|
|
201
|
+
base_img = deepcopy(pil_img)
|
|
202
|
+
images[page_nr] = base_img
|
|
203
|
+
|
|
204
|
+
# Overlay key‑value content
|
|
205
|
+
for page_nr, img in images.items(): # type: ignore
|
|
206
|
+
assert isinstance(page_nr, int)
|
|
207
|
+
scale_x = img.width / doc.pages[page_nr].size.width
|
|
208
|
+
scale_y = img.height / doc.pages[page_nr].size.height
|
|
209
|
+
self._draw_key_value_layer(
|
|
210
|
+
image=img,
|
|
211
|
+
doc=doc,
|
|
212
|
+
page_no=page_nr,
|
|
213
|
+
scale_x=scale_x,
|
|
214
|
+
scale_y=scale_y,
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
return images
|
|
@@ -1373,11 +1373,12 @@ class PictureItem(FloatingItem):
|
|
|
1373
1373
|
) # Encode to Base64 and decode to string
|
|
1374
1374
|
return img_base64
|
|
1375
1375
|
|
|
1376
|
-
|
|
1376
|
+
@staticmethod
|
|
1377
|
+
def _image_to_hexhash(img: Optional[PILImage.Image]) -> Optional[str]:
|
|
1377
1378
|
"""Hexhash from the image."""
|
|
1378
|
-
if
|
|
1379
|
+
if img is not None:
|
|
1379
1380
|
# Convert the image to raw bytes
|
|
1380
|
-
image_bytes =
|
|
1381
|
+
image_bytes = img.tobytes()
|
|
1381
1382
|
|
|
1382
1383
|
# Create a hash object (e.g., SHA-256)
|
|
1383
1384
|
hasher = hashlib.sha256(usedforsecurity=False)
|
|
@@ -4116,16 +4117,10 @@ class DoclingDocument(BaseModel):
|
|
|
4116
4117
|
if image_dir.is_dir():
|
|
4117
4118
|
for item, level in result.iterate_items(page_no=page_no, with_groups=False):
|
|
4118
4119
|
if isinstance(item, PictureItem):
|
|
4120
|
+
img = item.get_image(doc=self)
|
|
4121
|
+
if img is not None:
|
|
4119
4122
|
|
|
4120
|
-
|
|
4121
|
-
item.image is not None
|
|
4122
|
-
and isinstance(item.image.uri, AnyUrl)
|
|
4123
|
-
and item.image.uri.scheme == "data"
|
|
4124
|
-
and item.image.pil_image is not None
|
|
4125
|
-
):
|
|
4126
|
-
img = item.image.pil_image
|
|
4127
|
-
|
|
4128
|
-
hexhash = item._image_to_hexhash()
|
|
4123
|
+
hexhash = PictureItem._image_to_hexhash(img)
|
|
4129
4124
|
|
|
4130
4125
|
# loc_path = image_dir / f"image_{img_count:06}.png"
|
|
4131
4126
|
if hexhash is not None:
|
|
@@ -4140,6 +4135,11 @@ class DoclingDocument(BaseModel):
|
|
|
4140
4135
|
else:
|
|
4141
4136
|
obj_path = loc_path
|
|
4142
4137
|
|
|
4138
|
+
if item.image is None:
|
|
4139
|
+
scale = img.size[0] / item.prov[0].bbox.width
|
|
4140
|
+
item.image = ImageRef.from_pil(
|
|
4141
|
+
image=img, dpi=round(72 * scale)
|
|
4142
|
+
)
|
|
4143
4143
|
item.image.uri = Path(obj_path)
|
|
4144
4144
|
|
|
4145
4145
|
# if item.image._pil is not None:
|
|
@@ -4539,6 +4539,8 @@ class DoclingDocument(BaseModel):
|
|
|
4539
4539
|
reference_path = None
|
|
4540
4540
|
else:
|
|
4541
4541
|
reference_path = filename.parent
|
|
4542
|
+
artifacts_dir = reference_path / artifacts_dir
|
|
4543
|
+
|
|
4542
4544
|
return artifacts_dir, reference_path
|
|
4543
4545
|
|
|
4544
4546
|
def _make_copy_with_refmode(
|
|
@@ -5543,8 +5545,27 @@ class DoclingDocument(BaseModel):
|
|
|
5543
5545
|
self,
|
|
5544
5546
|
show_label: bool = True,
|
|
5545
5547
|
show_branch_numbering: bool = False,
|
|
5548
|
+
viz_mode: Literal["reading_order", "key_value"] = "reading_order",
|
|
5549
|
+
show_cell_id: bool = False,
|
|
5546
5550
|
) -> dict[Optional[int], PILImage.Image]:
|
|
5547
|
-
"""Get visualization of the document as images by page.
|
|
5551
|
+
"""Get visualization of the document as images by page.
|
|
5552
|
+
|
|
5553
|
+
:param show_label: Show labels on elements (applies to all visualizers).
|
|
5554
|
+
:type show_label: bool
|
|
5555
|
+
:param show_branch_numbering: Show branch numbering (reading order visualizer only).
|
|
5556
|
+
:type show_branch_numbering: bool
|
|
5557
|
+
:param viz_mode: Which visualizer to use. One of 'reading_order' (default), 'key_value'.
|
|
5558
|
+
:type viz_mode: str
|
|
5559
|
+
:param show_cell_id: Show cell IDs (key value visualizer only).
|
|
5560
|
+
:type show_cell_id: bool
|
|
5561
|
+
|
|
5562
|
+
:returns: Dictionary mapping page numbers to PIL images.
|
|
5563
|
+
:rtype: dict[Optional[int], PILImage.Image]
|
|
5564
|
+
"""
|
|
5565
|
+
from docling_core.transforms.visualizer.base import BaseVisualizer
|
|
5566
|
+
from docling_core.transforms.visualizer.key_value_visualizer import (
|
|
5567
|
+
KeyValueVisualizer,
|
|
5568
|
+
)
|
|
5548
5569
|
from docling_core.transforms.visualizer.layout_visualizer import (
|
|
5549
5570
|
LayoutVisualizer,
|
|
5550
5571
|
)
|
|
@@ -5552,18 +5573,34 @@ class DoclingDocument(BaseModel):
|
|
|
5552
5573
|
ReadingOrderVisualizer,
|
|
5553
5574
|
)
|
|
5554
5575
|
|
|
5555
|
-
|
|
5556
|
-
|
|
5557
|
-
|
|
5576
|
+
visualizer_obj: BaseVisualizer
|
|
5577
|
+
if viz_mode == "reading_order":
|
|
5578
|
+
visualizer_obj = ReadingOrderVisualizer(
|
|
5579
|
+
base_visualizer=LayoutVisualizer(
|
|
5580
|
+
params=LayoutVisualizer.Params(
|
|
5581
|
+
show_label=show_label,
|
|
5582
|
+
),
|
|
5583
|
+
),
|
|
5584
|
+
params=ReadingOrderVisualizer.Params(
|
|
5585
|
+
show_branch_numbering=show_branch_numbering,
|
|
5586
|
+
),
|
|
5587
|
+
)
|
|
5588
|
+
elif viz_mode == "key_value":
|
|
5589
|
+
visualizer_obj = KeyValueVisualizer(
|
|
5590
|
+
base_visualizer=LayoutVisualizer(
|
|
5591
|
+
params=LayoutVisualizer.Params(
|
|
5592
|
+
show_label=show_label,
|
|
5593
|
+
),
|
|
5594
|
+
),
|
|
5595
|
+
params=KeyValueVisualizer.Params(
|
|
5558
5596
|
show_label=show_label,
|
|
5597
|
+
show_cell_id=show_cell_id,
|
|
5559
5598
|
),
|
|
5560
|
-
)
|
|
5561
|
-
|
|
5562
|
-
|
|
5563
|
-
),
|
|
5564
|
-
)
|
|
5565
|
-
images = visualizer.get_visualization(doc=self)
|
|
5599
|
+
)
|
|
5600
|
+
else:
|
|
5601
|
+
raise ValueError(f"Unknown visualization mode: {viz_mode}")
|
|
5566
5602
|
|
|
5603
|
+
images = visualizer_obj.get_visualization(doc=self)
|
|
5567
5604
|
return images
|
|
5568
5605
|
|
|
5569
5606
|
@field_validator("version")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.43.1
|
|
3
|
+
Version: 2.44.1
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -44,6 +44,7 @@ docling_core/transforms/serializer/html_styles.py
|
|
|
44
44
|
docling_core/transforms/serializer/markdown.py
|
|
45
45
|
docling_core/transforms/visualizer/__init__.py
|
|
46
46
|
docling_core/transforms/visualizer/base.py
|
|
47
|
+
docling_core/transforms/visualizer/key_value_visualizer.py
|
|
47
48
|
docling_core/transforms/visualizer/layout_visualizer.py
|
|
48
49
|
docling_core/transforms/visualizer/reading_order_visualizer.py
|
|
49
50
|
docling_core/transforms/visualizer/table_visualizer.py
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "docling-core"
|
|
3
|
-
version = "2.43.1"
|
|
3
|
+
version = "2.44.1" # DO NOT EDIT, updated automatically
|
|
4
4
|
description = "A python library to define and validate data types in Docling."
|
|
5
5
|
license = "MIT"
|
|
6
6
|
license-files = ["LICENSE"]
|
|
@@ -1442,10 +1442,11 @@ def test_save_to_disk():
|
|
|
1442
1442
|
|
|
1443
1443
|
doc: DoclingDocument = _construct_doc()
|
|
1444
1444
|
|
|
1445
|
-
|
|
1445
|
+
test_dir = Path("./test/data/doc")
|
|
1446
|
+
image_dir = Path("constructed_images/") # will be relative to test_dir
|
|
1446
1447
|
|
|
1447
1448
|
doc_with_references = doc._with_pictures_refs(
|
|
1448
|
-
image_dir=
|
|
1449
|
+
image_dir=(test_dir / image_dir),
|
|
1449
1450
|
page_no=None,
|
|
1450
1451
|
)
|
|
1451
1452
|
|
|
@@ -1455,19 +1456,19 @@ def test_save_to_disk():
|
|
|
1455
1456
|
|
|
1456
1457
|
### MarkDown
|
|
1457
1458
|
|
|
1458
|
-
filename =
|
|
1459
|
+
filename = test_dir / "constructed_doc.placeholder.md"
|
|
1459
1460
|
doc.save_as_markdown(
|
|
1460
1461
|
filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.PLACEHOLDER
|
|
1461
1462
|
)
|
|
1462
1463
|
_verify_saved_output(filename=filename, paths=paths)
|
|
1463
1464
|
|
|
1464
|
-
filename =
|
|
1465
|
+
filename = test_dir / "constructed_doc.embedded.md"
|
|
1465
1466
|
doc.save_as_markdown(
|
|
1466
1467
|
filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.EMBEDDED
|
|
1467
1468
|
)
|
|
1468
1469
|
_verify_saved_output(filename=filename, paths=paths)
|
|
1469
1470
|
|
|
1470
|
-
filename =
|
|
1471
|
+
filename = test_dir / "constructed_doc.referenced.md"
|
|
1471
1472
|
doc.save_as_markdown(
|
|
1472
1473
|
filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.REFERENCED
|
|
1473
1474
|
)
|
|
@@ -1475,19 +1476,19 @@ def test_save_to_disk():
|
|
|
1475
1476
|
|
|
1476
1477
|
### HTML
|
|
1477
1478
|
|
|
1478
|
-
filename =
|
|
1479
|
+
filename = test_dir / "constructed_doc.placeholder.html"
|
|
1479
1480
|
doc.save_as_html(
|
|
1480
1481
|
filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.PLACEHOLDER
|
|
1481
1482
|
)
|
|
1482
1483
|
_verify_saved_output(filename=filename, paths=paths)
|
|
1483
1484
|
|
|
1484
|
-
filename =
|
|
1485
|
+
filename = test_dir / "constructed_doc.embedded.html"
|
|
1485
1486
|
doc.save_as_html(
|
|
1486
1487
|
filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.EMBEDDED
|
|
1487
1488
|
)
|
|
1488
1489
|
_verify_saved_output(filename=filename, paths=paths)
|
|
1489
1490
|
|
|
1490
|
-
filename =
|
|
1491
|
+
filename = test_dir / "constructed_doc.referenced.html"
|
|
1491
1492
|
doc.save_as_html(
|
|
1492
1493
|
filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.REFERENCED
|
|
1493
1494
|
)
|
|
@@ -1495,13 +1496,13 @@ def test_save_to_disk():
|
|
|
1495
1496
|
|
|
1496
1497
|
### Document Tokens
|
|
1497
1498
|
|
|
1498
|
-
filename =
|
|
1499
|
+
filename = test_dir / "constructed_doc.dt"
|
|
1499
1500
|
doc.save_as_doctags(filename=filename)
|
|
1500
1501
|
_verify_saved_output(filename=filename, paths=paths)
|
|
1501
1502
|
|
|
1502
1503
|
### JSON
|
|
1503
1504
|
|
|
1504
|
-
filename =
|
|
1505
|
+
filename = test_dir / "constructed_doc.embedded.json"
|
|
1505
1506
|
doc.save_as_json(
|
|
1506
1507
|
filename=filename,
|
|
1507
1508
|
artifacts_dir=image_dir,
|
|
@@ -1512,7 +1513,7 @@ def test_save_to_disk():
|
|
|
1512
1513
|
doc_emb_loaded = DoclingDocument.load_from_json(filename)
|
|
1513
1514
|
_verify_loaded_output(filename=filename, pred=doc_emb_loaded)
|
|
1514
1515
|
|
|
1515
|
-
filename =
|
|
1516
|
+
filename = test_dir / "constructed_doc.referenced.json"
|
|
1516
1517
|
doc.save_as_json(
|
|
1517
1518
|
filename=filename,
|
|
1518
1519
|
artifacts_dir=image_dir,
|
|
@@ -1525,7 +1526,7 @@ def test_save_to_disk():
|
|
|
1525
1526
|
|
|
1526
1527
|
### YAML
|
|
1527
1528
|
|
|
1528
|
-
filename =
|
|
1529
|
+
filename = test_dir / "constructed_doc.embedded.yaml"
|
|
1529
1530
|
doc.save_as_yaml(
|
|
1530
1531
|
filename=filename,
|
|
1531
1532
|
artifacts_dir=image_dir,
|
|
@@ -1533,7 +1534,7 @@ def test_save_to_disk():
|
|
|
1533
1534
|
)
|
|
1534
1535
|
_verify_saved_output(filename=filename, paths=paths)
|
|
1535
1536
|
|
|
1536
|
-
filename =
|
|
1537
|
+
filename = test_dir / "constructed_doc.referenced.yaml"
|
|
1537
1538
|
doc.save_as_yaml(
|
|
1538
1539
|
filename=filename,
|
|
1539
1540
|
artifacts_dir=image_dir,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.43.1 → docling_core-2.44.1}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.43.1 → docling_core-2.44.1}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
{docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/chunker/tokenizer/__init__.py
RENAMED
|
File without changes
|
{docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/chunker/tokenizer/base.py
RENAMED
|
File without changes
|
{docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/chunker/tokenizer/huggingface.py
RENAMED
|
File without changes
|
{docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/chunker/tokenizer/openai.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/serializer/html_styles.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/visualizer/layout_visualizer.py
RENAMED
|
File without changes
|
|
File without changes
|
{docling_core-2.43.1 → docling_core-2.44.1}/docling_core/transforms/visualizer/table_visualizer.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|